mirror of https://github.com/HumanAIGC/lite-avatar.git (synced 2026-02-05 09:59:18 +08:00)
add files
0 funasr_local/runtime/python/__init__.py Normal file
56 funasr_local/runtime/python/benchmark_libtorch.md Normal file
@@ -0,0 +1,56 @@
# CPU Benchmark (Libtorch)

## Configuration

### Data set

Aishell1 [test set](https://www.openslr.org/33/); the total audio duration is 36108.919 seconds.

### Tools

#### Install requirements

Install ModelScope and FunASR:

```shell
pip install -U modelscope funasr
# For users in China, you can install from a mirror:
# pip install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
```

Install the benchmark requirements:

```shell
git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
cd funasr/runtime/python/utils
pip install -r requirements.txt
```

#### Recipe

##### test_rtf

Set the model, the data path and the output_dir, then run:

```shell
nohup bash test_rtf.sh &> log.txt &
```

##### test_cer

Set the model, the data path and the output_dir, then run:

```shell
nohup bash test_cer.sh &> log.txt &
```

## [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)

### Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz, 16 cores / 32 processors, with avx512_vnni

| concurrent-tasks | processing time (s) | RTF    | Speedup Rate |
|:----------------:|:-------------------:|:------:|:------------:|
| 1 (torch fp32)   | 3522                | 0.0976 | 10.3         |
| 1 (torch int8)   | 1746                | 0.0484 | 20.7         |
| 32 (torch fp32)  | 236                 | 0.0066 | 152.7        |
| 32 (torch int8)  | 114                 | 0.0032 | 317.4        |
| 64 (torch fp32)  | 235                 | 0.0065 | 153.7        |
| 64 (torch int8)  | 113                 | 0.0031 | 319.2        |
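For reference, RTF here is the total processing time divided by the total audio duration, and the speedup rate is its reciprocal; a minimal sketch of the arithmetic behind the first row of the table above:

```python
# Reproduce the RTF and speedup figures quoted in the tables.
# RTF = processing time / total audio duration; speedup = 1 / RTF.
AUDIO_SECONDS = 36108.919  # Aishell1 test set duration quoted above

def rtf_and_speedup(processing_seconds: float):
    rtf = processing_seconds / AUDIO_SECONDS
    return rtf, 1.0 / rtf

rtf, speedup = rtf_and_speedup(3522)  # 1 concurrent task, torch fp32
print(f"RTF={rtf:.4f}, speedup={speedup:.1f}")  # RTF~0.0975, speedup~10.3
```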
[//]: # (### Intel(R) Xeon(R) Platinum 8163 CPU @ 2.50GHz 32core-64processor without avx512_vnni)

## [Paraformer](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary)
121 funasr_local/runtime/python/benchmark_onnx.md Normal file
@@ -0,0 +1,121 @@
# CPU Benchmark (ONNX-python)

## Configuration

### Data set

Aishell1 [test set](https://www.openslr.org/33/); the total audio duration is 36108.919 seconds.

### Tools

#### Install requirements

Install ModelScope and FunASR:

```shell
pip install -U modelscope funasr
# For users in China, you can install from a mirror:
# pip install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
```

Install the benchmark requirements:

```shell
git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
cd funasr/runtime/python/utils
pip install -r requirements.txt
```

#### Recipe

##### test_rtf

Set the model, the data path and the output_dir, then run:

```shell
nohup bash test_rtf.sh &> log.txt &
```

##### test_cer

Set the model, the data path and the output_dir, then run:

```shell
nohup bash test_cer.sh &> log.txt &
```

## [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)

Number of parameters: 220M

Storage size: 880MB

Storage size after int8-quant: 237MB

CER: 1.95%

CER after int8-quant: 1.95%
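As a sanity check on the storage figures, fp32 weights take 4 bytes per parameter and int8 roughly 1 byte (the quantized file is a bit larger than the naive estimate, presumably because some ops stay in fp32); a quick sketch:

```python
# Rough storage estimate from the parameter count quoted above.
params = 220e6  # Paraformer-large

fp32_mb = params * 4 / 1e6  # 4 bytes/param -> ~880 MB, matches "Storage size: 880MB"
int8_mb = params * 1 / 1e6  # 1 byte/param  -> ~220 MB, close to the quoted 237MB
print(f"fp32 ~ {fp32_mb:.0f} MB, int8 ~ {int8_mb:.0f} MB")
```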
### Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz, 16 cores / 32 processors, with avx512_vnni

| concurrent-tasks | processing time (s) | RTF    | Speedup Rate |
|:----------------:|:-------------------:|:------:|:------------:|
| 1 (onnx fp32)    | 2806                | 0.0777 | 12.9         |
| 1 (onnx int8)    | 1611                | 0.0446 | 22.4         |
| 8 (onnx fp32)    | 538                 | 0.0149 | 67.1         |
| 8 (onnx int8)    | 210                 | 0.0058 | 172.4        |
| 16 (onnx fp32)   | 288                 | 0.0080 | 125.2        |
| 16 (onnx int8)   | 117                 | 0.0032 | 309.9        |
| 32 (onnx fp32)   | 167                 | 0.0046 | 216.5        |
| 32 (onnx int8)   | 86                  | 0.0024 | 420.0        |
| 64 (onnx fp32)   | 158                 | 0.0044 | 228.1        |
| 64 (onnx int8)   | 82                  | 0.0023 | 442.8        |
| 96 (onnx fp32)   | 151                 | 0.0042 | 238.0        |
| 96 (onnx int8)   | 80                  | 0.0022 | 452.0        |

### Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz, 16 cores / 32 processors, with avx512_vnni

| concurrent-tasks | processing time (s) | RTF    | Speedup Rate |
|:----------------:|:-------------------:|:------:|:------------:|
| 1 (onnx fp32)    | 2613                | 0.0724 | 13.8         |
| 1 (onnx int8)    | 1321                | 0.0366 | 22.4         |
| 32 (onnx fp32)   | 170                 | 0.0047 | 212.7        |
| 32 (onnx int8)   | 89                  | 0.0025 | 407.0        |
| 64 (onnx fp32)   | 166                 | 0.0046 | 217.1        |
| 64 (onnx int8)   | 87                  | 0.0024 | 414.7        |

### Intel(R) Xeon(R) Platinum 8163 CPU @ 2.50GHz, 32 cores / 64 processors, without avx512_vnni

| concurrent-tasks | processing time (s) | RTF    | Speedup Rate |
|:----------------:|:-------------------:|:------:|:------------:|
| 1 (onnx fp32)    | 2959                | 0.0820 | 12.2         |
| 1 (onnx int8)    | 2814                | 0.0778 | 12.8         |
| 16 (onnx fp32)   | 373                 | 0.0103 | 96.9         |
| 16 (onnx int8)   | 331                 | 0.0091 | 109.0        |
| 32 (onnx fp32)   | 211                 | 0.0058 | 171.4        |
| 32 (onnx int8)   | 181                 | 0.0050 | 200.0        |
| 64 (onnx fp32)   | 153                 | 0.0042 | 235.9        |
| 64 (onnx int8)   | 103                 | 0.0029 | 349.9        |
| 96 (onnx fp32)   | 146                 | 0.0041 | 247.0        |
| 96 (onnx int8)   | 108                 | 0.0030 | 334.1        |

## [Paraformer](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary)

Number of parameters: 68M

Storage size: 275MB

Storage size after int8-quant: 81MB

CER: 3.73%

CER after int8-quant: 3.78%

### Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz, 16 cores / 32 processors, with avx512_vnni

| concurrent-tasks | processing time (s) | RTF    | Speedup Rate |
|:----------------:|:-------------------:|:------:|:------------:|
| 1 (onnx fp32)    | 1173                | 0.0325 | 30.8         |
| 1 (onnx int8)    | 976                 | 0.0270 | 37.0         |
| 16 (onnx fp32)   | 91                  | 0.0025 | 395.2        |
| 16 (onnx int8)   | 78                  | 0.0022 | 463.0        |
| 32 (onnx fp32)   | 60                  | 0.0017 | 598.8        |
| 32 (onnx int8)   | 40                  | 0.0011 | 892.9        |
| 64 (onnx fp32)   | 55                  | 0.0015 | 653.6        |
| 64 (onnx int8)   | 31                  | 0.0009 | 1162.8       |
| 96 (onnx fp32)   | 57                  | 0.0016 | 632.9        |
| 96 (onnx int8)   | 33                  | 0.0009 | 1098.9       |
107 funasr_local/runtime/python/benchmark_onnx_cpp.md Normal file
@@ -0,0 +1,107 @@
# CPU Benchmark (ONNX-cpp)

## Configuration

### Data set

Aishell1 [test set](https://www.openslr.org/33/); the total audio duration is 36108.919 seconds.

### Tools

#### Install [modelscope and funasr](https://github.com/alibaba-damo-academy/FunASR#installation)

```shell
pip3 install torch torchaudio
pip install -U modelscope
pip install -U funasr
```

#### Export the [onnx model](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export)

```shell
python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize True
```

#### Building for Linux/Unix

Download onnxruntime:

```shell
# Download an appropriate onnxruntime release from
# https://github.com/microsoft/onnxruntime/releases/tag/v1.14.0
# Here we take the Linux x64 build:
wget https://github.com/microsoft/onnxruntime/releases/download/v1.14.0/onnxruntime-linux-x64-1.14.0.tgz
tar -zxvf onnxruntime-linux-x64-1.14.0.tgz
```

Install openblas:

```shell
sudo apt-get install libopenblas-dev   # ubuntu
# sudo yum -y install openblas-devel   # centos
```

Build the runtime:

```shell
git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR/funasr/runtime/onnxruntime
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=release .. -DONNXRUNTIME_DIR=/path/to/onnxruntime-linux-x64-1.14.0
make
```

#### Recipe

Set the model, the data path and the output_dir, then run:

```shell
./bin/funasr-onnx-offline-rtf /path/to/model_dir /path/to/wav.scp quantize(true or false) thread_num
```

The structure of /path/to/model_dir:

```
config.yaml, am.mvn, model.onnx (or model_quant.onnx)
```
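The wav.scp follows the usual Kaldi convention of one `utterance-id /path/to/wav` pair per line (the same layout the Python clients below parse with `line.split()[1]`); a hypothetical helper to build one:

```python
# Hypothetical sketch: write a Kaldi-style wav.scp ("utt-id path" per line)
# for the RTF tool above. The directory is a placeholder.
from pathlib import Path

wav_dir = Path("/data/aishell1/test")  # assumption: where your test wavs live
with open("wav.scp", "w") as f:
    for wav in sorted(wav_dir.glob("*.wav")):
        f.write(f"{wav.stem} {wav}\n")
```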
## [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)

Number of parameters: 220M

Storage size: 880MB

Storage size after int8-quant: 237MB

CER: 1.95%

CER after int8-quant: 1.95%

### Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz, 16 cores / 32 processors, with avx512_vnni

| concurrent-tasks | processing time (s) | RTF      | Speedup Rate |
|:----------------:|:-------------------:|:--------:|:------------:|
| 1 (onnx fp32)    | 2129                | 0.058974 | 17           |
| 1 (onnx int8)    | 1020                | 0.02826  | 35           |
| 8 (onnx fp32)    | 273                 | 0.007553 | 132          |
| 8 (onnx int8)    | 128                 | 0.003558 | 281          |
| 16 (onnx fp32)   | 146                 | 0.00403  | 248          |
| 16 (onnx int8)   | 67                  | 0.001868 | 535          |
| 32 (onnx fp32)   | 133                 | 0.003672 | 272          |
| 32 (onnx int8)   | 64                  | 0.001778 | 562          |
| 64 (onnx fp32)   | 136                 | 0.003771 | 265          |
| 64 (onnx int8)   | 67                  | 0.001846 | 541          |
| 96 (onnx fp32)   | 137                 | 0.003788 | 264          |
| 96 (onnx int8)   | 68                  | 0.001875 | 533          |

### Intel(R) Xeon(R) Platinum 8163 CPU @ 2.50GHz, 32 cores / 64 processors, without avx512_vnni

| concurrent-tasks | processing time (s) | RTF      | Speedup Rate |
|:----------------:|:-------------------:|:--------:|:------------:|
| 1 (onnx fp32)    | 2903                | 0.080404 | 12           |
| 1 (onnx int8)    | 2714                | 0.075168 | 13           |
| 8 (onnx fp32)    | 373                 | 0.010329 | 97           |
| 8 (onnx int8)    | 340                 | 0.009428 | 106          |
| 16 (onnx fp32)   | 189                 | 0.005252 | 190          |
| 16 (onnx int8)   | 174                 | 0.004817 | 207          |
| 32 (onnx fp32)   | 109                 | 0.00301  | 332          |
| 32 (onnx int8)   | 88                  | 0.00245  | 408          |
| 64 (onnx fp32)   | 113                 | 0.003129 | 320          |
| 64 (onnx int8)   | 79                  | 0.002201 | 454          |
| 96 (onnx fp32)   | 115                 | 0.003183 | 314          |
| 96 (onnx int8)   | 80                  | 0.002222 | 450          |
1 funasr_local/runtime/python/grpc/.gitignore vendored Normal file

@@ -0,0 +1 @@

**/__pycache__
95 funasr_local/runtime/python/grpc/Readme.md Normal file
@@ -0,0 +1,95 @@
# Service with grpc-python

With the grpc client we can send streaming audio data to the server in real time (e.g. a chunk every 10 ms) and receive the transcribed text once the user stops speaking. The audio data is streamed, while the ASR inference itself runs offline, one utterance at a time.
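At a 16 kHz sample rate, a 10 ms chunk is 160 samples (the mic client's default `--mic_chunk`), i.e. 320 bytes of 16-bit PCM; a quick check:

```python
# Chunk arithmetic behind the ~10 ms streaming interval mentioned above.
SAMPLE_RATE = 16000  # Hz, the default --sample_rate
CHUNK_MS = 10        # send interval

samples_per_chunk = SAMPLE_RATE * CHUNK_MS // 1000  # 160, the --mic_chunk default
bytes_per_chunk = samples_per_chunk * 2             # int16 PCM -> 320 bytes
print(samples_per_chunk, bytes_per_chunk)
```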
## For the server

### Prepare the server environment

#### Backend: modelscope pipeline (default)

Install modelscope and funasr:

```shell
pip install -U modelscope funasr
# For users in China, you can install from a mirror:
# pip install -U modelscope funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
git clone https://github.com/alibaba/FunASR.git && cd FunASR
```

Install the server requirements:

```shell
cd funasr/runtime/python/grpc
pip install -r requirements_server.txt
```

#### Backend: funasr_onnx (optional)

Install [`funasr_onnx`](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime):

```shell
pip install funasr_onnx -i https://pypi.Python.org/simple
```

Export the model; for details see the [export docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime):

```shell
python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize True
```

### Generate the protobuf files

Run this on the server; the two generated pb files are used by both the server and the client.

```shell
# paraformer_pb2.py and paraformer_pb2_grpc.py are already generated;
# regenerate them only when you change ./proto/paraformer.proto.
python -m grpc_tools.protoc --proto_path=./proto -I ./proto --python_out=. --grpc_python_out=./ ./proto/paraformer.proto
```

### Start the grpc server

```shell
python grpc_main_server.py --port 10095 --backend pipeline
```

If you want to run the server with onnxruntime, set `backend` and `onnx_dir`:

```shell
python grpc_main_server.py --port 10095 --backend onnxruntime --onnx_dir /models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
```

## For the client

### Install the requirements

```shell
git clone https://github.com/alibaba/FunASR.git && cd FunASR
cd funasr/runtime/python/grpc
pip install -r requirements_client.txt
```

### Generate the protobuf files

As on the server side, the two generated pb files are used by both the server and the client.

```shell
# paraformer_pb2.py and paraformer_pb2_grpc.py are already generated;
# regenerate them only when you change ./proto/paraformer.proto.
python -m grpc_tools.protoc --proto_path=./proto -I ./proto --python_out=. --grpc_python_out=./ ./proto/paraformer.proto
```

### Start the grpc client

```shell
python grpc_main_client_mic.py --host 127.0.0.1 --port 10095
```
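Besides the microphone client, here is a minimal one-shot recognition sketch built on the `transcribe_audio_bytes` helper from `grpc_client.py` below; the wav path is a placeholder, and the user must be in the server's `--user_allowed` list:

```python
# Minimal sketch: send one utterance to a running server and print the text.
# Assumes the pb files were generated as above and the server listens on 10095;
# "your.wav" is a placeholder for a 16 kHz, 16-bit mono file.
import json

import grpc
import soundfile as sf

from grpc_client import transcribe_audio_bytes
from paraformer_pb2_grpc import ASRStub

USER = "project1_user1"  # must be in the server's --user_allowed list

with grpc.insecure_channel("127.0.0.1:10095") as channel:
    stub = ASRStub(channel)
    wav, _ = sf.read("your.wav", dtype="int16")
    # Buffer the audio (speaking=True), then signal end of speech
    # (speaking=False) so the server decodes the buffered utterance.
    next(transcribe_audio_bytes(stub, wav.tobytes(), user=USER,
                                speaking=True, isEnd=False))
    responses = transcribe_audio_bytes(stub, None, user=USER,
                                       speaking=False, isEnd=False)
    resp = next(responses)
    if resp.action == "decoding":
        resp = next(responses)
    if resp.action == "finish":
        print(json.loads(resp.sentence)["text"])
    # Tell the server to drop this user's state.
    next(transcribe_audio_bytes(stub, None, user=USER,
                                speaking=False, isEnd=True))
```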
## Workflow design

<div align="left"><img src="proto/workflow.png" width="400"/></div>

## Reference

We borrow from or refer to the following code:

1) https://github.com/wenet-e2e/wenet/tree/main/runtime/core/grpc

2) https://github.com/Open-Speech-EkStep/inference_service/blob/main/realtime_inference_service.py
17 funasr_local/runtime/python/grpc/grpc_client.py Normal file
@@ -0,0 +1,17 @@
import queue

import paraformer_pb2


def transcribe_audio_bytes(stub, chunk, user='zksz', language='zh-CN', speaking=True, isEnd=False):
    # Build a single Request and hand it to the streaming Recognize stub;
    # iter(my_queue.get, None) keeps yielding requests until a None sentinel appears.
    req = paraformer_pb2.Request()
    if chunk is not None:
        req.audio_data = chunk
    req.user = user
    req.language = language
    req.speaking = speaking
    req.isEnd = isEnd
    my_queue = queue.SimpleQueue()
    my_queue.put(req)
    return stub.Recognize(iter(my_queue.get, None))
62 funasr_local/runtime/python/grpc/grpc_main_client.py Normal file
@@ -0,0 +1,62 @@
import grpc
import json
import time
import asyncio
import soundfile as sf
import argparse

from grpc_client import transcribe_audio_bytes
from paraformer_pb2_grpc import ASRStub


# Send the audio data once, then read the recognition result.
async def grpc_rec(wav_scp, grpc_uri, asr_user, language):
    with grpc.insecure_channel(grpc_uri) as channel:
        stub = ASRStub(channel)
        for line in wav_scp:
            wav_file = line.split()[1]
            wav, _ = sf.read(wav_file, dtype='int16')

            b = time.time()
            response = transcribe_audio_bytes(stub, wav.tobytes(), user=asr_user, language=language, speaking=False, isEnd=False)
            resp = next(response)
            text = ''
            if 'decoding' == resp.action:
                resp = next(response)
                if 'finish' == resp.action:
                    text = json.loads(resp.sentence)['text']
            response = transcribe_audio_bytes(stub, None, user=asr_user, language=language, speaking=False, isEnd=True)
            res = {'text': text, 'time': time.time() - b}
            print(res)


async def test(args):
    wav_scp = open(args.wav_scp, "r").readlines()
    uri = '{}:{}'.format(args.host, args.port)
    res = await grpc_rec(wav_scp, uri, args.user_allowed, language='zh-CN')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--host",
                        type=str,
                        default="127.0.0.1",
                        required=False,
                        help="grpc server host ip")
    parser.add_argument("--port",
                        type=int,
                        default=10108,
                        required=False,
                        help="grpc server port")
    parser.add_argument("--user_allowed",
                        type=str,
                        default="project1_user1",
                        help="allowed user for grpc client")
    parser.add_argument("--sample_rate",
                        type=int,
                        default=16000,
                        help="audio sample_rate from client")
    parser.add_argument("--wav_scp",
                        type=str,
                        required=True,
                        help="audio wav scp")
    args = parser.parse_args()

    asyncio.run(test(args))
112 funasr_local/runtime/python/grpc/grpc_main_client_mic.py Normal file
@@ -0,0 +1,112 @@
import pyaudio
import grpc
import json
import webrtcvad
import time
import asyncio
import argparse

from grpc_client import transcribe_audio_bytes
from paraformer_pb2_grpc import ASRStub


async def deal_chunk(sig_mic):
    global stub, SPEAKING, asr_user, language, sample_rate
    if vad.is_speech(sig_mic, sample_rate):  # speaking
        SPEAKING = True
        response = transcribe_audio_bytes(stub, sig_mic, user=asr_user, language=language, speaking=True, isEnd=False)  # speaking, send audio to the server
    else:  # silence
        begin_time = 0
        if SPEAKING:  # we have some audio recorded, ask the server to recognize it
            SPEAKING = False
            begin_time = int(round(time.time() * 1000))
            response = transcribe_audio_bytes(stub, None, user=asr_user, language=language, speaking=False, isEnd=False)  # speech ended, ask the server to recognize one sentence
            resp = next(response)
            if "decoding" == resp.action:
                resp = next(response)  # TODO: this blocking call may drop some audio chunks; C++ multi-threading is preferred
                if "finish" == resp.action:
                    end_time = int(round(time.time() * 1000))
                    print(json.loads(resp.sentence))
                    print("delay in ms: %d " % (end_time - begin_time))
        else:
            pass


async def record(host, port, sample_rate, mic_chunk, record_seconds, asr_user, language):
    with grpc.insecure_channel('{}:{}'.format(host, port)) as channel:
        global stub
        stub = ASRStub(channel)
        for i in range(0, int(sample_rate / mic_chunk * record_seconds)):
            sig_mic = stream.read(mic_chunk, exception_on_overflow=False)
            await asyncio.create_task(deal_chunk(sig_mic))

        # end grpc
        response = transcribe_audio_bytes(stub, None, user=asr_user, language=language, speaking=False, isEnd=True)
        print(next(response).action)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--host",
                        type=str,
                        default="127.0.0.1",
                        required=True,
                        help="grpc server host ip")

    parser.add_argument("--port",
                        type=int,
                        default=10095,
                        required=True,
                        help="grpc server port")

    parser.add_argument("--user_allowed",
                        type=str,
                        default="project1_user1",
                        help="allowed user for grpc client")

    parser.add_argument("--sample_rate",
                        type=int,
                        default=16000,
                        help="audio sample_rate from client")

    parser.add_argument("--mic_chunk",
                        type=int,
                        default=160,
                        help="chunk size for mic")

    parser.add_argument("--record_seconds",
                        type=int,
                        default=120,
                        help="run specified seconds then exit")

    args = parser.parse_args()

    SPEAKING = False
    asr_user = args.user_allowed
    sample_rate = args.sample_rate
    language = 'zh-CN'

    vad = webrtcvad.Vad()
    vad.set_mode(1)

    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    p = pyaudio.PyAudio()

    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=args.sample_rate,
                    input=True,
                    frames_per_buffer=args.mic_chunk)

    print("* recording")
    asyncio.run(record(args.host, args.port, args.sample_rate, args.mic_chunk, args.record_seconds, args.user_allowed, language))
    stream.stop_stream()
    stream.close()
    p.terminate()
    print("recording stop")
68 funasr_local/runtime/python/grpc/grpc_main_server.py Normal file
@@ -0,0 +1,68 @@
import grpc
from concurrent import futures
import argparse

import paraformer_pb2_grpc
from grpc_server import ASRServicer


def serve(args):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10),
                         # interceptors=(AuthInterceptor('Bearer mysecrettoken'),)
                         )
    paraformer_pb2_grpc.add_ASRServicer_to_server(
        ASRServicer(args.user_allowed, args.model, args.sample_rate, args.backend, args.onnx_dir, vad_model=args.vad_model, punc_model=args.punc_model), server)
    port = "[::]:" + str(args.port)
    server.add_insecure_port(port)
    server.start()
    print("grpc server started!")
    server.wait_for_termination()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--port",
                        type=int,
                        default=10095,
                        required=True,
                        help="grpc server port")

    parser.add_argument("--user_allowed",
                        type=str,
                        default="project1_user1|project1_user2|project2_user3",
                        help="allowed users for grpc clients, separated by '|'")

    parser.add_argument("--model",
                        type=str,
                        default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                        help="model from modelscope")

    parser.add_argument("--vad_model",
                        type=str,
                        default="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                        help="model from modelscope")

    parser.add_argument("--punc_model",
                        type=str,
                        default="",
                        help="model from modelscope")

    parser.add_argument("--sample_rate",
                        type=int,
                        default=16000,
                        help="audio sample_rate from client")

    parser.add_argument("--backend",
                        type=str,
                        default="pipeline",
                        choices=("pipeline", "onnxruntime"),
                        help="backend, either modelscope pipeline or onnxruntime")

    parser.add_argument("--onnx_dir",
                        type=str,
                        default="/nfs/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                        help="onnx model dir")

    args = parser.parse_args()

    serve(args)
132 funasr_local/runtime/python/grpc/grpc_server.py Normal file
@@ -0,0 +1,132 @@
from concurrent import futures
import grpc
import json
import time

import paraformer_pb2_grpc
from paraformer_pb2 import Response


class ASRServicer(paraformer_pb2_grpc.ASRServicer):
    def __init__(self, user_allowed, model, sample_rate, backend, onnx_dir, vad_model='', punc_model=''):
        print("ASRServicer init")
        self.backend = backend
        self.init_flag = 0
        self.client_buffers = {}
        self.client_transcription = {}
        self.auth_user = user_allowed.split("|")
        if self.backend == "pipeline":
            try:
                from modelscope.pipelines import pipeline
                from modelscope.utils.constant import Tasks
            except ImportError:
                raise ImportError("Please install modelscope")
            self.inference_16k_pipeline = pipeline(task=Tasks.auto_speech_recognition, model=model, vad_model=vad_model, punc_model=punc_model)
        elif self.backend == "onnxruntime":
            try:
                from funasr_local_onnx import Paraformer
            except ImportError:
                raise ImportError("Please install the onnxruntime environment")
            self.inference_16k_pipeline = Paraformer(model_dir=onnx_dir)
        self.sample_rate = sample_rate

    def clear_states(self, user):
        self.clear_buffers(user)
        self.clear_transcriptions(user)

    def clear_buffers(self, user):
        if user in self.client_buffers:
            del self.client_buffers[user]

    def clear_transcriptions(self, user):
        if user in self.client_transcription:
            del self.client_transcription[user]

    def disconnect(self, user):
        self.clear_states(user)
        print("Disconnecting user: %s" % str(user))

    def Recognize(self, request_iterator, context):
        for req in request_iterator:
            if req.user not in self.auth_user:
                result = {}
                result["success"] = False
                result["detail"] = "Not Authorized user: %s " % req.user
                result["text"] = ""
                yield Response(sentence=json.dumps(result), user=req.user, action="terminate", language=req.language)
            elif req.isEnd:  # end grpc
                print("asr end")
                self.disconnect(req.user)
                result = {}
                result["success"] = True
                result["detail"] = "asr end"
                result["text"] = ""
                yield Response(sentence=json.dumps(result), user=req.user, action="terminate", language=req.language)
            elif req.speaking:  # still speaking: buffer the audio
                if req.audio_data is not None and len(req.audio_data) > 0:
                    if req.user in self.client_buffers:
                        self.client_buffers[req.user] += req.audio_data  # append audio
                    else:
                        self.client_buffers[req.user] = req.audio_data
                result = {}
                result["success"] = True
                result["detail"] = "speaking"
                result["text"] = ""
                yield Response(sentence=json.dumps(result), user=req.user, action="speaking", language=req.language)
            elif not req.speaking:  # silence: decode whatever is buffered
                if req.user not in self.client_buffers:
                    result = {}
                    result["success"] = True
                    result["detail"] = "waiting_for_more_voice"
                    result["text"] = ""
                    yield Response(sentence=json.dumps(result), user=req.user, action="waiting", language=req.language)
                else:
                    begin_time = int(round(time.time() * 1000))
                    tmp_data = self.client_buffers[req.user]
                    self.clear_states(req.user)
                    result = {}
                    result["success"] = True
                    result["detail"] = "decoding data: %d bytes" % len(tmp_data)
                    result["text"] = ""
                    yield Response(sentence=json.dumps(result), user=req.user, action="decoding", language=req.language)
                    if len(tmp_data) < 9600:  # min input length for the asr model: 300 ms of 16 kHz int16 audio
                        end_time = int(round(time.time() * 1000))
                        delay_str = str(end_time - begin_time)
                        result = {}
                        result["success"] = True
                        result["detail"] = "waiting_for_more_voice"
                        result["server_delay_ms"] = delay_str
                        result["text"] = ""
                        print("user: %s , delay(ms): %s, info: %s " % (req.user, delay_str, "waiting_for_more_voice"))
                        yield Response(sentence=json.dumps(result), user=req.user, action="waiting", language=req.language)
                    else:
                        if self.backend == "pipeline":
                            asr_result = self.inference_16k_pipeline(audio_in=tmp_data, audio_fs=self.sample_rate)
                            if "text" in asr_result:
                                asr_result = asr_result['text']
                            else:
                                asr_result = ""
                        elif self.backend == "onnxruntime":
                            from funasr_local_onnx.utils.frontend import load_bytes
                            array = load_bytes(tmp_data)
                            asr_result = self.inference_16k_pipeline(array)[0]
                        end_time = int(round(time.time() * 1000))
                        delay_str = str(end_time - begin_time)
                        print("user: %s , delay(ms): %s, text: %s " % (req.user, delay_str, asr_result))
                        result = {}
                        result["success"] = True
                        result["detail"] = "finish_sentence"
                        result["server_delay_ms"] = delay_str
                        result["text"] = asr_result
                        yield Response(sentence=json.dumps(result), user=req.user, action="finish", language=req.language)
            else:
                result = {}
                result["success"] = False
                result["detail"] = "error, no condition matched! Unknown reason."
                result["text"] = ""
                self.disconnect(req.user)
                yield Response(sentence=json.dumps(result), user=req.user, action="terminate", language=req.language)
30 funasr_local/runtime/python/grpc/paraformer_pb2.py Normal file
@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: paraformer.proto
"""Generated protocol buffer code."""
from google.protobuf.internal import builder as _builder
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
# @@protoc_insertion_point(imports)

_sym_db = _symbol_database.Default()


DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x10paraformer.proto\x12\nparaformer\"^\n\x07Request\x12\x12\n\naudio_data\x18\x01 \x01(\x0c\x12\x0c\n\x04user\x18\x02 \x01(\t\x12\x10\n\x08language\x18\x03 \x01(\t\x12\x10\n\x08speaking\x18\x04 \x01(\x08\x12\r\n\x05isEnd\x18\x05 \x01(\x08\"L\n\x08Response\x12\x10\n\x08sentence\x18\x01 \x01(\t\x12\x0c\n\x04user\x18\x02 \x01(\t\x12\x10\n\x08language\x18\x03 \x01(\t\x12\x0e\n\x06\x61\x63tion\x18\x04 \x01(\t2C\n\x03\x41SR\x12<\n\tRecognize\x12\x13.paraformer.Request\x1a\x14.paraformer.Response\"\x00(\x01\x30\x01\x42\x16\n\x07\x65x.grpc\xa2\x02\nparaformerb\x06proto3')

_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'paraformer_pb2', globals())
if _descriptor._USE_C_DESCRIPTORS == False:

  DESCRIPTOR._options = None
  DESCRIPTOR._serialized_options = b'\n\007ex.grpc\242\002\nparaformer'
  _REQUEST._serialized_start=32
  _REQUEST._serialized_end=126
  _RESPONSE._serialized_start=128
  _RESPONSE._serialized_end=204
  _ASR._serialized_start=206
  _ASR._serialized_end=273
# @@protoc_insertion_point(module_scope)
66 funasr_local/runtime/python/grpc/paraformer_pb2_grpc.py Normal file
@@ -0,0 +1,66 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc

import paraformer_pb2 as paraformer__pb2


class ASRStub(object):
    """Missing associated documentation comment in .proto file."""

    def __init__(self, channel):
        """Constructor.

        Args:
            channel: A grpc.Channel.
        """
        self.Recognize = channel.stream_stream(
                '/paraformer.ASR/Recognize',
                request_serializer=paraformer__pb2.Request.SerializeToString,
                response_deserializer=paraformer__pb2.Response.FromString,
                )


class ASRServicer(object):
    """Missing associated documentation comment in .proto file."""

    def Recognize(self, request_iterator, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')


def add_ASRServicer_to_server(servicer, server):
    rpc_method_handlers = {
            'Recognize': grpc.stream_stream_rpc_method_handler(
                    servicer.Recognize,
                    request_deserializer=paraformer__pb2.Request.FromString,
                    response_serializer=paraformer__pb2.Response.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'paraformer.ASR', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))


# This class is part of an EXPERIMENTAL API.
class ASR(object):
    """Missing associated documentation comment in .proto file."""

    @staticmethod
    def Recognize(request_iterator,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.stream_stream(request_iterator, target, '/paraformer.ASR/Recognize',
            paraformer__pb2.Request.SerializeToString,
            paraformer__pb2.Response.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
25 funasr_local/runtime/python/grpc/proto/Readme.md Normal file
@@ -0,0 +1,25 @@
```
service ASR { // grpc service
  rpc Recognize (stream Request) returns (stream Response) {} // stub
}

message Request { // request data
  bytes audio_data = 1; // audio data in bytes
  string user = 2;      // allowed user
  string language = 3;  // language, zh-CN for now
  bool speaking = 4;    // flag for speaking
  bool isEnd = 5;       // flag for the end of the session; set isEnd to true when you stop asr:
                        // vad: speech  -> speaking=True,  isEnd=False; audio data is appended for the specified user
                        // vad: silence -> speaking=False, isEnd=False; the audio buffer is cleared and asr inference runs
}

message Response { // response data
  string sentence = 1; // json string with a success flag and the asr text
  string user = 2;     // same as the request user
  string language = 3; // same as the request language
  string action = 4;   // server status:
                       // terminate: asr stopped
                       // speaking: the user is speaking, audio data is being appended
                       // decoding: the server is decoding
                       // finish: asr text is ready (the most used)
}
```
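For instance, when `action` is `finish`, `grpc_server.py` fills `sentence` with JSON of the following shape (a sketch mirroring the result dict assembled there; the values are illustrative):

```python
# Illustrative Response.sentence payload for action == "finish",
# mirroring the result dict built in grpc_server.py.
import json

sentence = json.dumps({
    "success": True,
    "detail": "finish_sentence",
    "server_delay_ms": "350",  # illustrative value
    "text": "...",             # the recognized text
})
```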
38 funasr_local/runtime/python/grpc/proto/paraformer.proto Normal file
@@ -0,0 +1,38 @@
// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

option java_package = "ex.grpc";
option objc_class_prefix = "paraformer";

package paraformer;

service ASR {
  rpc Recognize (stream Request) returns (stream Response) {}
}

message Request {
  bytes audio_data = 1;
  string user = 2;
  string language = 3;
  bool speaking = 4;
  bool isEnd = 5;
}

message Response {
  string sentence = 1;
  string user = 2;
  string language = 3;
  string action = 4;
}
BIN funasr_local/runtime/python/grpc/proto/workflow.png Normal file

Binary file not shown (size: 64 KiB).
4 funasr_local/runtime/python/grpc/requirements_client.txt Normal file

@@ -0,0 +1,4 @@

pyaudio
webrtcvad
grpcio
grpcio-tools
2 funasr_local/runtime/python/grpc/requirements_server.txt Normal file

@@ -0,0 +1,2 @@

grpcio
grpcio-tools
73 funasr_local/runtime/python/libtorch/README.md Normal file
@@ -0,0 +1,73 @@
# Libtorch-python

## Export the model

### Install [modelscope and funasr](https://github.com/alibaba-damo-academy/FunASR#installation)

```shell
# pip3 install torch torchaudio
pip install -U modelscope funasr
# For users in China, you can install from a mirror:
# pip install -U modelscope funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
pip install torch-quant       # Optional, for torchscript quantization
pip install onnx onnxruntime  # Optional, for onnx quantization
```

### Export the [torchscript model](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export)

```shell
python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type torch --quantize True
```

## Install `funasr_torch`

Install from pip:

```shell
pip install -U funasr_torch
# For users in China, you can install from a mirror:
# pip install -U funasr_torch -i https://mirror.sjtu.edu.cn/pypi/web/simple
```

Or install from source code:

```shell
git clone https://github.com/alibaba/FunASR.git && cd FunASR
cd funasr/runtime/python/libtorch
pip install -e ./
# For users in China, you can install from a mirror:
# pip install -e ./ -i https://mirror.sjtu.edu.cn/pypi/web/simple
```

## Run the demo

- Model_dir: the model path, which contains `model.torchscripts`, `config.yaml` and `am.mvn`.
- Input: wav file(s); supported input types: `str, np.ndarray, List[str]`.
- Output: `List[str]`: the recognition result.
- Example:

```python
from funasr_torch import Paraformer

model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1)

wav_path = ['/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']

result = model(wav_path)
print(result)
```

## Performance benchmark

Please refer to [benchmark](https://github.com/alibaba-damo-academy/FunASR/blob/main/funasr/runtime/python/benchmark_libtorch.md).

## Speed

Environment: Intel(R) Xeon(R) Platinum 8163 CPU @ 2.50GHz

Test [wav, 5.53s, averaged over 100 runs](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav)

| Backend  | RTF (FP32) |
|:--------:|:----------:|
| Pytorch  | 0.110      |
| Libtorch | 0.048      |
| Onnx     | 0.038      |

## Acknowledgement

This project is maintained by the [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
0 funasr_local/runtime/python/libtorch/__init__.py Normal file
15 funasr_local/runtime/python/libtorch/demo.py Normal file
@@ -0,0 +1,15 @@
from funasr_local_torch import Paraformer


model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"

model = Paraformer(model_dir, batch_size=1)  # cpu
# model = Paraformer(model_dir, batch_size=1, device_id=0)  # gpu

# When using the paraformer-large-vad-punc model, you can set plot_timestamp_to="./xx.png"
# to get a figure of the alignment in addition to the timestamps.
# model = Paraformer(model_dir, batch_size=1, plot_timestamp_to="test.png")

wav_path = "YourPath/xx.wav"

result = model(wav_path)
print(result)
@@ -0,0 +1,2 @@
# -*- encoding: utf-8 -*-
from .paraformer_bin import Paraformer
@@ -0,0 +1,197 @@
# -*- encoding: utf-8 -*-
import os.path
from pathlib import Path
from typing import List, Union, Tuple

import copy
import librosa
import numpy as np

from .utils.utils import (CharTokenizer, Hypothesis,
                          TokenIDConverter, get_logger,
                          read_yaml)
from .utils.postprocess_utils import sentence_postprocess
from .utils.frontend import WavFrontend
from .utils.timestamp_utils import time_stamp_lfr6_onnx

logging = get_logger()

import torch


class Paraformer():
    def __init__(self, model_dir: Union[str, Path] = None,
                 batch_size: int = 1,
                 device_id: Union[str, int] = "-1",
                 plot_timestamp_to: str = "",
                 quantize: bool = False,
                 intra_op_num_threads: int = 1,
                 ):

        if not Path(model_dir).exists():
            raise FileNotFoundError(f'{model_dir} does not exist.')

        model_file = os.path.join(model_dir, 'model.torchscripts')
        if quantize:
            model_file = os.path.join(model_dir, 'model_quant.torchscripts')
        config_file = os.path.join(model_dir, 'config.yaml')
        cmvn_file = os.path.join(model_dir, 'am.mvn')
        config = read_yaml(config_file)

        self.converter = TokenIDConverter(config['token_list'])
        self.tokenizer = CharTokenizer()
        self.frontend = WavFrontend(
            cmvn_file=cmvn_file,
            **config['frontend_conf']
        )
        self.ort_infer = torch.jit.load(model_file)
        self.batch_size = batch_size
        self.device_id = device_id
        self.plot_timestamp_to = plot_timestamp_to
        if "predictor_bias" in config['model_conf'].keys():
            self.pred_bias = config['model_conf']['predictor_bias']
        else:
            self.pred_bias = 0

    def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs) -> List:
        waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
        waveform_nums = len(waveform_list)
        asr_res = []
        for beg_idx in range(0, waveform_nums, self.batch_size):
            end_idx = min(waveform_nums, beg_idx + self.batch_size)
            feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
            try:
                with torch.no_grad():
                    if int(self.device_id) == -1:
                        outputs = self.ort_infer(feats, feats_len)
                        am_scores, valid_token_lens = outputs[0], outputs[1]
                    else:
                        outputs = self.ort_infer(feats.cuda(), feats_len.cuda())
                        am_scores, valid_token_lens = outputs[0].cpu(), outputs[1].cpu()
                    if len(outputs) == 4:
                        # for BiCifParaformer inference
                        us_alphas, us_peaks = outputs[2], outputs[3]
                    else:
                        us_alphas, us_peaks = None, None
            except Exception:
                # logging.warning(traceback.format_exc())
                logging.warning("input wav is silence or noise")
                preds = ['']
                us_alphas, us_peaks = None, None  # keep the variables defined for the code below
            else:
                preds = self.decode(am_scores, valid_token_lens)
            if us_peaks is None:
                for pred in preds:
                    pred = sentence_postprocess(pred)
                    asr_res.append({'preds': pred})
            else:
                for pred, us_peaks_ in zip(preds, us_peaks):
                    raw_tokens = pred
                    timestamp, timestamp_raw = time_stamp_lfr6_onnx(us_peaks_, copy.copy(raw_tokens))
                    text_proc, timestamp_proc, _ = sentence_postprocess(raw_tokens, timestamp_raw)
                    # logging.warning(timestamp)
                    if len(self.plot_timestamp_to):
                        self.plot_wave_timestamp(waveform_list[0], timestamp, self.plot_timestamp_to)
                    asr_res.append({'preds': text_proc, 'timestamp': timestamp_proc, "raw_tokens": raw_tokens})
        return asr_res

    def plot_wave_timestamp(self, wav, text_timestamp, dest):
        # Plot the waveform and the timestamp results with matplotlib
        import matplotlib
        matplotlib.use('Agg')
        matplotlib.rc("font", family='Alibaba PuHuiTi')  # set this to a font that your system supports
        import matplotlib.pyplot as plt
        fig, ax1 = plt.subplots(figsize=(11, 3.5), dpi=320)
        ax2 = ax1.twinx()
        ax2.set_ylim([0, 2.0])
        # plot the waveform
        ax1.set_ylim([-0.3, 0.3])
        time = np.arange(wav.shape[0]) / 16000
        ax1.plot(time, wav / wav.max() * 0.3, color='gray', alpha=0.4)
        # plot lines and text
        for (char, start, end) in text_timestamp:
            ax1.vlines(start, -0.3, 0.3, ls='--')
            ax1.vlines(end, -0.3, 0.3, ls='--')
            x_adj = 0.045 if char != '<sil>' else 0.12
            ax1.text((start + end) * 0.5 - x_adj, 0, char)
        # plt.legend()
        plotname = "{}/timestamp.png".format(dest)
        plt.savefig(plotname, bbox_inches='tight')

    def load_data(self,
                  wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
        def load_wav(path: str) -> np.ndarray:
            waveform, _ = librosa.load(path, sr=fs)
            return waveform

        if isinstance(wav_content, np.ndarray):
            return [wav_content]

        if isinstance(wav_content, str):
            return [load_wav(wav_content)]

        if isinstance(wav_content, list):
            return [load_wav(path) for path in wav_content]

        raise TypeError(
            f'The type of {wav_content} is not in [str, np.ndarray, list]')

    def extract_feat(self,
                     waveform_list: List[np.ndarray]
                     ) -> Tuple[np.ndarray, np.ndarray]:
        feats, feats_len = [], []
        for waveform in waveform_list:
            speech, _ = self.frontend.fbank(waveform)
            feat, feat_len = self.frontend.lfr_cmvn(speech)
            feats.append(feat)
            feats_len.append(feat_len)

        feats = self.pad_feats(feats, np.max(feats_len))
        feats_len = np.array(feats_len).astype(np.int32)
        feats = torch.from_numpy(feats).type(torch.float32)
        feats_len = torch.from_numpy(feats_len).type(torch.int32)
        return feats, feats_len

    @staticmethod
    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
            pad_width = ((0, max_feat_len - cur_len), (0, 0))
            return np.pad(feat, pad_width, 'constant', constant_values=0)

        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
        feats = np.array(feat_res).astype(np.float32)
        return feats

    def infer(self, feats: np.ndarray,
              feats_len: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        outputs = self.ort_infer(feats, feats_len)
        return outputs

    def decode(self, am_scores: np.ndarray, token_nums: int) -> List[str]:
        return [self.decode_one(am_score, token_num)
                for am_score, token_num in zip(am_scores, token_nums)]

    def decode_one(self,
                   am_score: np.ndarray,
                   valid_token_num: int) -> List[str]:
        yseq = am_score.argmax(axis=-1)
        score = am_score.max(axis=-1)
        score = np.sum(score, axis=-1)

        # pad with sos/eos ids to stay compatible with the token sequence format
        # asr_model.sos:1  asr_model.eos:2
        yseq = np.array([1] + yseq.tolist() + [2])
        hyp = Hypothesis(yseq=yseq, score=score)

        # remove sos/eos and get the results
        last_pos = -1
        token_int = hyp.yseq[1:last_pos].tolist()

        # remove blank symbol ids (assumed to be 0) and eos ids
        token_int = list(filter(lambda x: x not in (0, 2), token_int))

        # change integer ids to tokens
        token = self.converter.ids2tokens(token_int)
        token = token[:valid_token_num - self.pred_bias]
        # texts = sentence_postprocess(token)
        return token
@@ -0,0 +1,157 @@
import os
import numpy as np
import sys


def compute_wer(ref_file,
                hyp_file,
                cer_detail_file):
    rst = {
        'Wrd': 0,
        'Corr': 0,
        'Ins': 0,
        'Del': 0,
        'Sub': 0,
        'Snt': 0,
        'Err': 0.0,
        'S.Err': 0.0,
        'wrong_words': 0,
        'wrong_sentences': 0
    }

    hyp_dict = {}
    ref_dict = {}
    with open(hyp_file, 'r') as hyp_reader:
        for line in hyp_reader:
            key = line.strip().split()[0]
            value = line.strip().split()[1:]
            hyp_dict[key] = value
    with open(ref_file, 'r') as ref_reader:
        for line in ref_reader:
            key = line.strip().split()[0]
            value = line.strip().split()[1:]
            ref_dict[key] = value

    cer_detail_writer = open(cer_detail_file, 'w')
    for hyp_key in hyp_dict:
        if hyp_key in ref_dict:
            out_item = compute_wer_by_line(hyp_dict[hyp_key], ref_dict[hyp_key])
            rst['Wrd'] += out_item['nwords']
            rst['Corr'] += out_item['cor']
            rst['wrong_words'] += out_item['wrong']
            rst['Ins'] += out_item['ins']
            rst['Del'] += out_item['del']
            rst['Sub'] += out_item['sub']
            rst['Snt'] += 1
            if out_item['wrong'] > 0:
                rst['wrong_sentences'] += 1
            cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
            cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
            cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')

    if rst['Wrd'] > 0:
        rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)
    if rst['Snt'] > 0:
        rst['S.Err'] = round(rst['wrong_sentences'] * 100 / rst['Snt'], 2)

    cer_detail_writer.write('\n')
    cer_detail_writer.write("%WER " + str(rst['Err']) + " [ " + str(rst['wrong_words']) + " / " + str(rst['Wrd']) +
                            ", " + str(rst['Ins']) + " ins, " + str(rst['Del']) + " del, " + str(rst['Sub']) + " sub ]" + '\n')
    cer_detail_writer.write("%SER " + str(rst['S.Err']) + " [ " + str(rst['wrong_sentences']) + " / " + str(rst['Snt']) + " ]" + '\n')
    cer_detail_writer.write("Scored " + str(len(hyp_dict)) + " sentences, " + str(len(hyp_dict) - rst['Snt']) + " not present in hyp." + '\n')


def compute_wer_by_line(hyp,
                        ref):
    hyp = list(map(lambda x: x.lower(), hyp))
    ref = list(map(lambda x: x.lower(), ref))

    len_hyp = len(hyp)
    len_ref = len(ref)

    cost_matrix = np.zeros((len_hyp + 1, len_ref + 1), dtype=np.int16)

    ops_matrix = np.zeros((len_hyp + 1, len_ref + 1), dtype=np.int8)

    for i in range(len_hyp + 1):
        cost_matrix[i][0] = i
    for j in range(len_ref + 1):
        cost_matrix[0][j] = j

    for i in range(1, len_hyp + 1):
        for j in range(1, len_ref + 1):
            if hyp[i - 1] == ref[j - 1]:
                cost_matrix[i][j] = cost_matrix[i - 1][j - 1]
            else:
                substitution = cost_matrix[i - 1][j - 1] + 1
                insertion = cost_matrix[i - 1][j] + 1
                deletion = cost_matrix[i][j - 1] + 1

                compare_val = [substitution, insertion, deletion]

                min_val = min(compare_val)
                operation_idx = compare_val.index(min_val) + 1
                cost_matrix[i][j] = min_val
                ops_matrix[i][j] = operation_idx

    match_idx = []
    i = len_hyp
    j = len_ref
    rst = {
        'nwords': len_ref,
        'cor': 0,
        'wrong': 0,
        'ins': 0,
        'del': 0,
        'sub': 0
    }
    while i >= 0 or j >= 0:
        i_idx = max(0, i)
        j_idx = max(0, j)

        if ops_matrix[i_idx][j_idx] == 0:  # correct
            if i - 1 >= 0 and j - 1 >= 0:
                match_idx.append((j - 1, i - 1))
                rst['cor'] += 1

            i -= 1
            j -= 1

        elif ops_matrix[i_idx][j_idx] == 2:  # insert
            i -= 1
            rst['ins'] += 1

        elif ops_matrix[i_idx][j_idx] == 3:  # delete
            j -= 1
            rst['del'] += 1

        elif ops_matrix[i_idx][j_idx] == 1:  # substitute
            i -= 1
            j -= 1
            rst['sub'] += 1

        if i < 0 and j >= 0:
            rst['del'] += 1
        elif j < 0 and i >= 0:
            rst['ins'] += 1

    match_idx.reverse()
    wrong_cnt = cost_matrix[len_hyp][len_ref]
    rst['wrong'] = wrong_cnt

    return rst


def print_cer_detail(rst):
    return ("(" + "nwords=" + str(rst['nwords']) + ",cor=" + str(rst['cor'])
            + ",ins=" + str(rst['ins']) + ",del=" + str(rst['del']) + ",sub="
            + str(rst['sub']) + ") corr:" + '{:.2%}'.format(rst['cor'] / rst['nwords'])
            + ",cer:" + '{:.2%}'.format(rst['wrong'] / rst['nwords']))


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("usage : python compute-wer.py test.ref test.hyp test.wer")
        sys.exit(0)

    ref_file = sys.argv[1]
    hyp_file = sys.argv[2]
    cer_detail_file = sys.argv[3]
    compute_wer(ref_file, hyp_file, cer_detail_file)
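A tiny worked check of `compute_wer_by_line` above (hypothetical strings): with ref `a b c` and hyp `a x c` there is one substitution, so the CER is 1/3:

```python
# Hypothetical sanity check for compute_wer_by_line above.
out = compute_wer_by_line(hyp='a x c'.split(), ref='a b c'.split())
assert out['nwords'] == 3 and out['cor'] == 2 and out['sub'] == 1
print(print_cer_detail(out))  # (nwords=3,cor=2,ins=0,del=0,sub=1) corr:66.67%,cer:33.33%
```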
@@ -0,0 +1,191 @@
# -*- encoding: utf-8 -*-
from pathlib import Path
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union

import numpy as np
from typeguard import check_argument_types
import kaldi_native_fbank as knf

root_dir = Path(__file__).resolve().parent

logger_initialized = {}


class WavFrontend():
    """Conventional frontend structure for ASR.
    """

    def __init__(
            self,
            cmvn_file: str = None,
            fs: int = 16000,
            window: str = 'hamming',
            n_mels: int = 80,
            frame_length: int = 25,
            frame_shift: int = 10,
            lfr_m: int = 1,
            lfr_n: int = 1,
            dither: float = 1.0,
            **kwargs,
    ) -> None:
        check_argument_types()

        opts = knf.FbankOptions()
        opts.frame_opts.samp_freq = fs
        opts.frame_opts.dither = dither
        opts.frame_opts.window_type = window
        opts.frame_opts.frame_shift_ms = float(frame_shift)
        opts.frame_opts.frame_length_ms = float(frame_length)
        opts.mel_opts.num_bins = n_mels
        opts.energy_floor = 0
        opts.frame_opts.snip_edges = True
        opts.mel_opts.debug_mel = False
        self.opts = opts

        self.lfr_m = lfr_m
        self.lfr_n = lfr_n
        self.cmvn_file = cmvn_file

        if self.cmvn_file:
            self.cmvn = self.load_cmvn()
        self.fbank_fn = None
        self.fbank_beg_idx = 0
        self.reset_status()

    def fbank(self,
              waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        waveform = waveform * (1 << 15)
        self.fbank_fn = knf.OnlineFbank(self.opts)
        self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
        frames = self.fbank_fn.num_frames_ready
        mat = np.empty([frames, self.opts.mel_opts.num_bins])
        for i in range(frames):
            mat[i, :] = self.fbank_fn.get_frame(i)
        feat = mat.astype(np.float32)
        feat_len = np.array(mat.shape[0]).astype(np.int32)
        return feat, feat_len

    def fbank_online(self,
                     waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        waveform = waveform * (1 << 15)
        # self.fbank_fn = knf.OnlineFbank(self.opts)
        self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
        frames = self.fbank_fn.num_frames_ready
        mat = np.empty([frames, self.opts.mel_opts.num_bins])
        for i in range(self.fbank_beg_idx, frames):
            mat[i, :] = self.fbank_fn.get_frame(i)
        # self.fbank_beg_idx += (frames-self.fbank_beg_idx)
        feat = mat.astype(np.float32)
        feat_len = np.array(mat.shape[0]).astype(np.int32)
        return feat, feat_len

    def reset_status(self):
        self.fbank_fn = knf.OnlineFbank(self.opts)
        self.fbank_beg_idx = 0

    def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        if self.lfr_m != 1 or self.lfr_n != 1:
            feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)

        if self.cmvn_file:
            feat = self.apply_cmvn(feat)

        feat_len = np.array(feat.shape[0]).astype(np.int32)
        return feat, feat_len

    @staticmethod
    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
        LFR_inputs = []

        T = inputs.shape[0]
        T_lfr = int(np.ceil(T / lfr_n))
        left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
        inputs = np.vstack((left_padding, inputs))
        T = T + (lfr_m - 1) // 2
        for i in range(T_lfr):
            if lfr_m <= T - i * lfr_n:
                LFR_inputs.append(
                    (inputs[i * lfr_n:i * lfr_n + lfr_m]).reshape(1, -1))
            else:
                # process last LFR frame
                num_padding = lfr_m - (T - i * lfr_n)
                frame = inputs[i * lfr_n:].reshape(-1)
                for _ in range(num_padding):
                    frame = np.hstack((frame, inputs[-1]))

                LFR_inputs.append(frame)
        LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
        return LFR_outputs

    def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
        """
        Apply CMVN with mvn data
        """
        frame, dim = inputs.shape
        means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
        vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))
        inputs = (inputs + means) * vars
        return inputs

    def load_cmvn(self,) -> np.ndarray:
        with open(self.cmvn_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        means_list = []
        vars_list = []
        for i in range(len(lines)):
            line_item = lines[i].split()
            if line_item[0] == '<AddShift>':
                line_item = lines[i + 1].split()
                if line_item[0] == '<LearnRateCoef>':
                    add_shift_line = line_item[3:(len(line_item) - 1)]
                    means_list = list(add_shift_line)
                    continue
            elif line_item[0] == '<Rescale>':
                line_item = lines[i + 1].split()
                if line_item[0] == '<LearnRateCoef>':
                    rescale_line = line_item[3:(len(line_item) - 1)]
                    vars_list = list(rescale_line)
                    continue

        means = np.array(means_list).astype(np.float64)
        vars = np.array(vars_list).astype(np.float64)
        cmvn = np.array([means, vars])
        return cmvn


def load_bytes(input):
    middle_data = np.frombuffer(input, dtype=np.int16)
    middle_data = np.asarray(middle_data)
    if middle_data.dtype.kind not in 'iu':
        raise TypeError("'middle_data' must be an array of integers")
    dtype = np.dtype('float32')
    if dtype.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")

    i = np.iinfo(middle_data.dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max
    array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32)
    return array


def test():
    path = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav"
    import librosa
    cmvn_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/am.mvn"
    config_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/config.yaml"
    from funasr_local.runtime.python.onnxruntime.rapid_paraformer.utils.utils import read_yaml
    config = read_yaml(config_file)
    waveform, _ = librosa.load(path, sr=None)
    frontend = WavFrontend(
        cmvn_file=cmvn_file,
        **config['frontend_conf'],
    )
    speech, _ = frontend.fbank_online(waveform)  # 1d waveform (sample,) in, (frame, n_mels) fbank out
    feat, feat_len = frontend.lfr_cmvn(speech)  # 2d, (frame, lfr_m * n_mels), np.float32

    frontend.reset_status()  # clear cache
    return feat, feat_len


if __name__ == '__main__':
    test()
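The low frame rate (LFR) stacking above is what turns the per-frame fbank features into the wider vectors the downstream encoder consumes: `lfr_m` consecutive frames are concatenated, and the window hops `lfr_n` frames. A quick shape check (values chosen for illustration; assumes `WavFrontend` from this file is importable):

```python
import numpy as np

feats = np.random.randn(100, 80).astype(np.float32)   # 100 fbank frames, 80 mel bins
out = WavFrontend.apply_lfr(feats, lfr_m=7, lfr_n=6)  # stack 7 frames, hop 6
print(out.shape)  # (17, 560): ceil(100 / 6) frames, 7 * 80 dims each
```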
@@ -0,0 +1,240 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import string
import logging
from typing import Any, List, Union


def isChinese(ch: str):
    # note: ASCII digits are intentionally treated as Chinese characters here
    if '\u4e00' <= ch <= '\u9fff' or '\u0030' <= ch <= '\u0039':
        return True
    return False


def isAllChinese(word: Union[List[Any], str]):
    word_lists = []
    for i in word:
        cur = i.replace(' ', '')
        cur = cur.replace('</s>', '')
        cur = cur.replace('<s>', '')
        word_lists.append(cur)

    if len(word_lists) == 0:
        return False

    for ch in word_lists:
        if isChinese(ch) is False:
            return False
    return True


def isAllAlpha(word: Union[List[Any], str]):
    word_lists = []
    for i in word:
        cur = i.replace(' ', '')
        cur = cur.replace('</s>', '')
        cur = cur.replace('<s>', '')
        word_lists.append(cur)

    if len(word_lists) == 0:
        return False

    for ch in word_lists:
        if ch.isalpha() is False and ch != "'":
            return False
        elif ch.isalpha() is True and isChinese(ch) is True:
            return False

    return True


# def abbr_dispose(words: List[Any]) -> List[Any]:
def abbr_dispose(words: List[Any], time_stamp: List[List] = None) -> List[Any]:
    words_size = len(words)
    word_lists = []
    abbr_begin = []
    abbr_end = []
    last_num = -1
    ts_lists = []
    ts_nums = []
    ts_index = 0
    for num in range(words_size):
        if num <= last_num:
            continue

        if len(words[num]) == 1 and words[num].encode('utf-8').isalpha():
            if num + 1 < words_size and words[
                    num + 1] == ' ' and num + 2 < words_size and len(
                        words[num +
                              2]) == 1 and words[num +
                                                 2].encode('utf-8').isalpha():
                # found the begin of abbr
                abbr_begin.append(num)
                num += 2
                abbr_end.append(num)
                # to find the end of abbr
                while True:
                    num += 1
                    if num < words_size and words[num] == ' ':
                        num += 1
                        if num < words_size and len(
                                words[num]) == 1 and words[num].encode(
                                    'utf-8').isalpha():
                            abbr_end.pop()
                            abbr_end.append(num)
                            last_num = num
                        else:
                            break
                    else:
                        break

    for num in range(words_size):
        if words[num] == ' ':
            ts_nums.append(ts_index)
        else:
            ts_nums.append(ts_index)
            ts_index += 1
    last_num = -1
    for num in range(words_size):
        if num <= last_num:
            continue

        if num in abbr_begin:
            if time_stamp is not None:
                begin = time_stamp[ts_nums[num]][0]
            word_lists.append(words[num].upper())
            num += 1
            while num < words_size:
                if num in abbr_end:
                    word_lists.append(words[num].upper())
                    last_num = num
                    break
                else:
                    if words[num].encode('utf-8').isalpha():
                        word_lists.append(words[num].upper())
                num += 1
            if time_stamp is not None:
                end = time_stamp[ts_nums[num]][1]
                ts_lists.append([begin, end])
        else:
            word_lists.append(words[num])
            if time_stamp is not None and words[num] != ' ':
                begin = time_stamp[ts_nums[num]][0]
                end = time_stamp[ts_nums[num]][1]
                ts_lists.append([begin, end])
                begin = end

    if time_stamp is not None:
        return word_lists, ts_lists
    else:
        return word_lists


def sentence_postprocess(words: List[Any], time_stamp: List[List] = None):
    middle_lists = []
    word_lists = []
    word_item = ''
    ts_lists = []

    # wash words lists
    for i in words:
        word = ''
        if isinstance(i, str):
            word = i
        else:
            word = i.decode('utf-8')

        if word in ['<s>', '</s>', '<unk>']:
            continue
        else:
            middle_lists.append(word)

    # all chinese characters
    if isAllChinese(middle_lists):
        for i, ch in enumerate(middle_lists):
            word_lists.append(ch.replace(' ', ''))
        if time_stamp is not None:
            ts_lists = time_stamp

    # all alpha characters
    elif isAllAlpha(middle_lists):
        ts_flag = True
        for i, ch in enumerate(middle_lists):
            if ts_flag and time_stamp is not None:
                begin = time_stamp[i][0]
                end = time_stamp[i][1]
            word = ''
            if '@@' in ch:
                word = ch.replace('@@', '')
                word_item += word
                if time_stamp is not None:
                    ts_flag = False
                    end = time_stamp[i][1]
            else:
                word_item += ch
                word_lists.append(word_item)
                word_lists.append(' ')
                word_item = ''
                if time_stamp is not None:
                    ts_flag = True
                    end = time_stamp[i][1]
                    ts_lists.append([begin, end])
                    begin = end

    # mix characters
    else:
        alpha_blank = False
        ts_flag = True
        begin = -1
        end = -1
        for i, ch in enumerate(middle_lists):
            if ts_flag and time_stamp is not None:
                begin = time_stamp[i][0]
                end = time_stamp[i][1]
            word = ''
            if isAllChinese(ch):
                if alpha_blank is True:
                    word_lists.pop()
                word_lists.append(ch)
                alpha_blank = False
                if time_stamp is not None:
                    ts_flag = True
                    ts_lists.append([begin, end])
                    begin = end
            elif '@@' in ch:
                word = ch.replace('@@', '')
                word_item += word
                alpha_blank = False
                if time_stamp is not None:
                    ts_flag = False
                    end = time_stamp[i][1]
            elif isAllAlpha(ch):
                word_item += ch
                word_lists.append(word_item)
                word_lists.append(' ')
                word_item = ''
                alpha_blank = True
                if time_stamp is not None:
                    ts_flag = True
                    end = time_stamp[i][1]
                    ts_lists.append([begin, end])
                    begin = end
            else:
                raise ValueError('invalid character: {}'.format(ch))

    if time_stamp is not None:
        word_lists, ts_lists = abbr_dispose(word_lists, ts_lists)
        real_word_lists = []
        for ch in word_lists:
            if ch != ' ':
                real_word_lists.append(ch)
        sentence = ' '.join(real_word_lists).strip()
        return sentence, ts_lists, real_word_lists
    else:
        word_lists = abbr_dispose(word_lists)
        real_word_lists = []
        for ch in word_lists:
            if ch != ' ':
                real_word_lists.append(ch)
        sentence = ''.join(word_lists).strip()
        return sentence, real_word_lists
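`sentence_postprocess` is the step that merges BPE continuation pieces (the `@@` markers) back into words and joins Latin and Chinese tokens. A quick illustration of the no-timestamp path, traced from the code above:

```python
tokens = ['HE@@', 'LLO', '你', '好']
sentence, words = sentence_postprocess(tokens)
print(sentence)  # "HELLO你好": the '@@' pieces are merged, and the trailing
                 # space after the Latin word is popped when Chinese follows
print(words)     # ['HELLO', '你', '好']
```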
@@ -0,0 +1,59 @@
import numpy as np


def time_stamp_lfr6_onnx(us_cif_peak, char_list, begin_time=0.0, total_offset=-1.5):
    if not len(char_list):
        return []
    START_END_THRESHOLD = 5
    MAX_TOKEN_DURATION = 30
    TIME_RATE = 10.0 * 6 / 1000 / 3  # 3 times upsampled
    cif_peak = us_cif_peak.reshape(-1)
    num_frames = cif_peak.shape[-1]
    if char_list[-1] == '</s>':
        char_list = char_list[:-1]
    # char_list = [i for i in text]
    timestamp_list = []
    new_char_list = []
    # for the bicif model trained with large data, cif2 actually fires when a character starts,
    # so treat the frames between two peaks as the duration of the former token
    fire_place = np.where(cif_peak > 1.0 - 1e-4)[0] + total_offset  # np format
    num_peak = len(fire_place)
    assert num_peak == len(char_list) + 1  # number of peaks is supposed to be number of tokens + 1
    # begin silence
    if fire_place[0] > START_END_THRESHOLD:
        # char_list.insert(0, '<sil>')
        timestamp_list.append([0.0, fire_place[0] * TIME_RATE])
        new_char_list.append('<sil>')
    # tokens timestamp
    for i in range(len(fire_place) - 1):
        new_char_list.append(char_list[i])
        if i == len(fire_place) - 2 or MAX_TOKEN_DURATION < 0 or fire_place[i + 1] - fire_place[i] < MAX_TOKEN_DURATION:
            timestamp_list.append([fire_place[i] * TIME_RATE, fire_place[i + 1] * TIME_RATE])
        else:
            # split the span into a token part and a silence part when the 0-weight frames last too long
            _split = fire_place[i] + MAX_TOKEN_DURATION
            timestamp_list.append([fire_place[i] * TIME_RATE, _split * TIME_RATE])
            timestamp_list.append([_split * TIME_RATE, fire_place[i + 1] * TIME_RATE])
            new_char_list.append('<sil>')
    # tail token and end silence
    if num_frames - fire_place[-1] > START_END_THRESHOLD:
        _end = (num_frames + fire_place[-1]) / 2
        timestamp_list[-1][1] = _end * TIME_RATE
        timestamp_list.append([_end * TIME_RATE, num_frames * TIME_RATE])
        new_char_list.append("<sil>")
    else:
        timestamp_list[-1][1] = num_frames * TIME_RATE
    if begin_time:  # add offset time in model with vad
        for i in range(len(timestamp_list)):
            timestamp_list[i][0] = timestamp_list[i][0] + begin_time / 1000.0
            timestamp_list[i][1] = timestamp_list[i][1] + begin_time / 1000.0
    assert len(new_char_list) == len(timestamp_list)
    res_str = ""
    for char, timestamp in zip(new_char_list, timestamp_list):
        res_str += "{} {} {};".format(char, timestamp[0], timestamp[1])
    res = []
    for char, timestamp in zip(new_char_list, timestamp_list):
        if char != '<sil>':
            res.append([int(timestamp[0] * 1000), int(timestamp[1] * 1000)])
    return res_str, res
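The frame-to-seconds conversion above packs three facts into `TIME_RATE`: a 10 ms frontend hop, LFR with a hop of 6, and the 3x upsampling of the CIF peaks, so each upsampled frame covers 10 * 6 / 3 = 20 ms. A sanity check:

```python
TIME_RATE = 10.0 * 6 / 1000 / 3  # seconds per upsampled frame
print(TIME_RATE)        # 0.02 -> 20 ms
print(150 * TIME_RATE)  # a peak at upsampled frame 150 lands at 3.0 s
```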
162
funasr_local/runtime/python/libtorch/funasr_torch/utils/utils.py
Normal file
@@ -0,0 +1,162 @@
# -*- encoding: utf-8 -*-

import functools
import logging
import pickle
from pathlib import Path
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union

import numpy as np
import yaml

from typeguard import check_argument_types

import warnings

root_dir = Path(__file__).resolve().parent

logger_initialized = {}


class TokenIDConverterError(Exception):
    # defined here because ids2tokens below raises it
    pass


class TokenIDConverter():
    def __init__(self, token_list: Union[List, str],
                 ):
        check_argument_types()

        self.token_list = token_list
        self.unk_symbol = token_list[-1]
        self.token2id = {v: i for i, v in enumerate(self.token_list)}
        self.unk_id = self.token2id[self.unk_symbol]

    def get_num_vocabulary_size(self) -> int:
        return len(self.token_list)

    def ids2tokens(self,
                   integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
        if isinstance(integers, np.ndarray) and integers.ndim != 1:
            raise TokenIDConverterError(
                f"Must be 1 dim ndarray, but got {integers.ndim}")
        return [self.token_list[i] for i in integers]

    def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
        return [self.token2id.get(i, self.unk_id) for i in tokens]


class CharTokenizer():
    def __init__(
            self,
            symbol_value: Union[Path, str, Iterable[str]] = None,
            space_symbol: str = "<space>",
            remove_non_linguistic_symbols: bool = False,
    ):
        check_argument_types()

        self.space_symbol = space_symbol
        self.non_linguistic_symbols = self.load_symbols(symbol_value)
        self.remove_non_linguistic_symbols = remove_non_linguistic_symbols

    @staticmethod
    def load_symbols(value: Union[Path, str, Iterable[str]] = None) -> Set:
        if value is None:
            return set()

        # `isinstance(value, Iterable[str])` raises a TypeError at runtime,
        # so treat any non-path iterable as a collection of symbols instead
        if not isinstance(value, (Path, str)):
            return set(value)

        file_path = Path(value)
        if not file_path.exists():
            logging.warning("%s doesn't exist.", file_path)
            return set()

        with file_path.open("r", encoding="utf-8") as f:
            return set(line.rstrip() for line in f)

    def text2tokens(self, line: Union[str, list]) -> List[str]:
        tokens = []
        while len(line) != 0:
            for w in self.non_linguistic_symbols:
                if line.startswith(w):
                    if not self.remove_non_linguistic_symbols:
                        tokens.append(line[: len(w)])
                    line = line[len(w):]
                    break
            else:
                t = line[0]
                if t == " ":
                    t = "<space>"
                tokens.append(t)
                line = line[1:]
        return tokens

    def tokens2text(self, tokens: Iterable[str]) -> str:
        tokens = [t if t != self.space_symbol else " " for t in tokens]
        return "".join(tokens)

    def __repr__(self):
        return (
            f"{self.__class__.__name__}("
            f'space_symbol="{self.space_symbol}"'
            f'non_linguistic_symbols="{self.non_linguistic_symbols}"'
            f")"
        )


class Hypothesis(NamedTuple):
    """Hypothesis data type."""

    yseq: np.ndarray
    score: Union[float, np.ndarray] = 0
    scores: Dict[str, Union[float, np.ndarray]] = dict()
    states: Dict[str, Any] = dict()

    def asdict(self) -> dict:
        """Convert data to JSON-friendly dict."""
        return self._replace(
            yseq=self.yseq.tolist(),
            score=float(self.score),
            scores={k: float(v) for k, v in self.scores.items()},
        )._asdict()


def read_yaml(yaml_path: Union[str, Path]) -> Dict:
    if not Path(yaml_path).exists():
        raise FileNotFoundError(f'The {yaml_path} does not exist.')

    with open(str(yaml_path), 'rb') as f:
        data = yaml.load(f, Loader=yaml.Loader)
    return data


@functools.lru_cache()
def get_logger(name='funasr_local_torch'):
    """Initialize and get a logger by name.
    If the logger has not been initialized, this method will initialize the
    logger by adding one or two handlers, otherwise the initialized logger will
    be directly returned. During initialization, a StreamHandler will always be
    added.
    Args:
        name (str): Logger name.
    Returns:
        logging.Logger: The expected logger.
    """
    logger = logging.getLogger(name)
    if name in logger_initialized:
        return logger

    for logger_name in logger_initialized:
        if name.startswith(logger_name):
            return logger

    formatter = logging.Formatter(
        '[%(asctime)s] %(name)s %(levelname)s: %(message)s',
        datefmt="%Y/%m/%d %H:%M:%S")

    sh = logging.StreamHandler()
    sh.setFormatter(formatter)
    logger.addHandler(sh)
    logger_initialized[name] = True
    logger.propagate = False
    return logger
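A short round trip through the converter above, assuming a toy vocabulary where the last entry is the unknown symbol (as the constructor expects):

```python
converter = TokenIDConverter(['<blank>', '你', '好', '<unk>'])
ids = converter.tokens2ids(['你', '好', '没见过'])  # unseen tokens map to unk_id
print(ids)                        # [1, 2, 3]
print(converter.ids2tokens(ids))  # ['你', '好', '<unk>']
```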
43
funasr_local/runtime/python/libtorch/setup.py
Normal file
@@ -0,0 +1,43 @@
# -*- encoding: utf-8 -*-
from pathlib import Path
import setuptools
from setuptools import find_packages


def get_readme():
    root_dir = Path(__file__).resolve().parent
    readme_path = str(root_dir / 'README.md')
    print(readme_path)
    with open(readme_path, 'r', encoding='utf-8') as f:
        readme = f.read()
    return readme


setuptools.setup(
    name='funasr_local_torch',
    version='0.0.4',
    platforms="Any",
    url="https://github.com/alibaba-damo-academy/FunASR.git",
    author="Speech Lab of DAMO Academy, Alibaba Group",
    author_email="funasr_local@list.alibaba-inc.com",
    description="FunASR: A Fundamental End-to-End Speech Recognition Toolkit",
    license="The MIT License",
    long_description=get_readme(),
    long_description_content_type='text/markdown',
    include_package_data=True,
    install_requires=["librosa", "onnxruntime>=1.7.0",
                      "scipy", "numpy>=1.19.3",
                      "typeguard", "kaldi-native-fbank",
                      "PyYAML>=5.1.2", "torch-quant >= 0.4.0"],
    packages=find_packages(include=["torch_paraformer*"]),
    keywords=[
        'funasr_local, paraformer, funasr_local_torch'
    ],
    classifiers=[
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
    ],
)
187
funasr_local/runtime/python/onnxruntime/README.md
Normal file
@@ -0,0 +1,187 @@
# ONNXRuntime-python

## Export the model
### Install [modelscope and funasr](https://github.com/alibaba-damo-academy/FunASR#installation)

```shell
#pip3 install torch torchaudio
pip install -U modelscope funasr
# For the users in China, you could install with the command:
# pip install -U modelscope funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
pip install torch-quant # Optional, for torchscript quantization
pip install onnx onnxruntime # Optional, for onnx quantization
```

### Export [onnx model](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export)

```shell
python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize True
```

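The export step writes everything the runtime below needs into `--export-dir`. A quick way to check (the exact file set is an assumption based on the loader code in this package, which looks for `model.onnx`, `model_quant.onnx`, `config.yaml`, and `am.mvn`):

```python
import os
export_dir = "./export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
print(os.listdir(export_dir))  # expect: am.mvn, config.yaml, model.onnx, model_quant.onnx, ...
```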
## Install `funasr_onnx`

install from pip
```shell
pip install -U funasr_onnx
# For the users in China, you could install with the command:
# pip install -U funasr_onnx -i https://mirror.sjtu.edu.cn/pypi/web/simple
```

or install from source code

```shell
git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
cd funasr/runtime/python/onnxruntime
pip install -e ./
# For the users in China, you could install with the command:
# pip install -e ./ -i https://mirror.sjtu.edu.cn/pypi/web/simple
```

## Inference with runtime

### Speech Recognition
#### Paraformer
```python
from funasr_onnx import Paraformer

model_dir = "./export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1, quantize=True)

wav_path = ['./export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']

result = model(wav_path)
print(result)
```
- `model_dir`: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
- `batch_size`: `1` (default), the batch size used during inference
- `device_id`: `-1` (default), infer on CPU. To infer on GPU, set it to the GPU id (make sure you have installed onnxruntime-gpu)
- `quantize`: `False` (default), load `model.onnx` from `model_dir`. If set to `True`, load `model_quant.onnx` instead
- `intra_op_num_threads`: `4` (default), the number of threads used for intra-op parallelism on CPU

Input: wav file(s); supported types: `str`, `np.ndarray`, `List[str]`

Output: `List`: recognition result

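Passing a `List[str]` of wav paths enables batching. The sketch below assumes each entry of the returned list carries a `preds` field, as in the `funasr_onnx` Paraformer sources bundled in this commit (the file names are hypothetical):

```python
wav_paths = ['a.wav', 'b.wav', 'c.wav']      # hypothetical files
model = Paraformer(model_dir, batch_size=2)  # decoded in batches of 2
for res in model(wav_paths):
    print(res['preds'])                      # one recognition result per input wav
```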
#### Paraformer-online

### Voice Activity Detection
#### FSMN-VAD
```python
from funasr_onnx import Fsmn_vad

model_dir = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
wav_path = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav"
model = Fsmn_vad(model_dir)

result = model(wav_path)
print(result)
```
- `model_dir`: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
- `batch_size`: `1` (default), the batch size used during inference
- `device_id`: `-1` (default), infer on CPU. To infer on GPU, set it to the GPU id (make sure you have installed onnxruntime-gpu)
- `quantize`: `False` (default), load `model.onnx` from `model_dir`. If set to `True`, load `model_quant.onnx` instead
- `intra_op_num_threads`: `4` (default), the number of threads used for intra-op parallelism on CPU

Input: wav file(s); supported types: `str`, `np.ndarray`, `List[str]`

Output: `List`: detected speech segments

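The segments come back as start/end pairs; a minimal consumer might look as follows (the nesting per input and the millisecond unit are assumptions based on the VAD sources in this commit):

```python
result = model(wav_path)
for beg_ms, end_ms in result[0]:  # segments for the first (only) input
    print(f"speech from {beg_ms} ms to {end_ms} ms")
```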
#### FSMN-VAD-online
```python
from funasr_onnx import Fsmn_vad_online
import soundfile


model_dir = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
wav_path = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav"
model = Fsmn_vad_online(model_dir)


##online vad
speech, sample_rate = soundfile.read(wav_path)
speech_length = speech.shape[0]
#
sample_offset = 0
step = 1600  # 1600 samples = 100 ms per chunk at 16 kHz
param_dict = {'in_cache': []}
for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
    if sample_offset + step >= speech_length - 1:
        step = speech_length - sample_offset
        is_final = True
    else:
        is_final = False
    param_dict['is_final'] = is_final
    segments_result = model(audio_in=speech[sample_offset: sample_offset + step],
                            param_dict=param_dict)
    if segments_result:
        print(segments_result)
```
- `model_dir`: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
- `batch_size`: `1` (default), the batch size used during inference
- `device_id`: `-1` (default), infer on CPU. To infer on GPU, set it to the GPU id (make sure you have installed onnxruntime-gpu)
- `quantize`: `False` (default), load `model.onnx` from `model_dir`. If set to `True`, load `model_quant.onnx` instead
- `intra_op_num_threads`: `4` (default), the number of threads used for intra-op parallelism on CPU

Input: wav file(s); supported types: `str`, `np.ndarray`, `List[str]`

Output: `List`: detected speech segments


### Punctuation Restoration
#### CT-Transformer
```python
from funasr_onnx import CT_Transformer

model_dir = "./export/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
model = CT_Transformer(model_dir)

text_in="跨境河流是养育沿岸人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切愿意进一步完善双方联合工作机制凡是中方能做的我们都会去做而且会做得更好我请印度朋友们放心中国在上游的任何开发利用都会经过科学规划和论证兼顾上下游的利益"
result = model(text_in)
print(result[0])
```
- `model_dir`: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
- `device_id`: `-1` (default), infer on CPU. To infer on GPU, set it to the GPU id (make sure you have installed onnxruntime-gpu)
- `quantize`: `False` (default), load `model.onnx` from `model_dir`. If set to `True`, load `model_quant.onnx` instead
- `intra_op_num_threads`: `4` (default), the number of threads used for intra-op parallelism on CPU

Input: `str`, raw text of asr result

Output: the punctuated text (`str`) and the predicted punctuation ids (`List`)


#### CT-Transformer-online
```python
from funasr_onnx import CT_Transformer_VadRealtime

model_dir = "./export/damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727"
model = CT_Transformer_VadRealtime(model_dir)

text_in = "跨境河流是养育沿岸|人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员|在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险|向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切|愿意进一步完善双方联合工作机制|凡是|中方能做的我们|都会去做而且会做得更好我请印度朋友们放心中国在上游的|任何开发利用都会经过科学|规划和论证兼顾上下游的利益"

vads = text_in.split("|")
rec_result_all=""
param_dict = {"cache": []}
for vad in vads:
    result = model(vad, param_dict=param_dict)
    rec_result_all += result[0]

print(rec_result_all)
```
- `model_dir`: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
- `device_id`: `-1` (default), infer on CPU. To infer on GPU, set it to the GPU id (make sure you have installed onnxruntime-gpu)
- `quantize`: `False` (default), load `model.onnx` from `model_dir`. If set to `True`, load `model_quant.onnx` instead
- `intra_op_num_threads`: `4` (default), the number of threads used for intra-op parallelism on CPU

Input: `str`, raw text of asr result

Output: the punctuated text (`str`) and the predicted punctuation ids (`List`)

## Performance benchmark

Please refer to [benchmark](https://github.com/alibaba-damo-academy/FunASR/blob/main/funasr/runtime/python/benchmark_onnx.md)

## Acknowledge
1. This project is maintained by the [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
2. We acknowledge [SWHL](https://github.com/RapidAI/RapidASR) for contributing the onnxruntime code (for the Paraformer model).
0
funasr_local/runtime/python/onnxruntime/__init__.py
Normal file
15
funasr_local/runtime/python/onnxruntime/demo.py
Normal file
@@ -0,0 +1,15 @@
from funasr_local_onnx import Paraformer


model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"

model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0)  # cpu
# model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0, device_id=0)  # gpu

# when using the paraformer-large-vad-punc model, you can set plot_timestamp_to="./xx.png" to get a figure of the alignment besides timestamps
# model = Paraformer(model_dir, batch_size=1, plot_timestamp_to="test.png")

wav_path = "YourPath/xx.wav"

result = model(wav_path)
print(result)
@@ -0,0 +1,8 @@
from funasr_local_onnx import CT_Transformer

model_dir = "../../../export/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
model = CT_Transformer(model_dir)

text_in = "跨境河流是养育沿岸人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切愿意进一步完善双方联合工作机制凡是中方能做的我们都会去做而且会做得更好我请印度朋友们放心中国在上游的任何开发利用都会经过科学规划和论证兼顾上下游的利益"
result = model(text_in)
print(result[0])
15
funasr_local/runtime/python/onnxruntime/demo_punc_online.py
Normal file
@@ -0,0 +1,15 @@
from funasr_local_onnx import CT_Transformer_VadRealtime

model_dir = "../../../export/damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727"
model = CT_Transformer_VadRealtime(model_dir)

text_in = "跨境河流是养育沿岸|人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员|在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险|向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切|愿意进一步完善双方联合工作机制|凡是|中方能做的我们|都会去做而且会做得更好我请印度朋友们放心中国在上游的|任何开发利用都会经过科学|规划和论证兼顾上下游的利益"

vads = text_in.split("|")
rec_result_all = ""
param_dict = {"cache": []}
for vad in vads:
    result = model(vad, param_dict=param_dict)
    rec_result_all += result[0]

print(rec_result_all)
11
funasr_local/runtime/python/onnxruntime/demo_vad_offline.py
Normal file
@@ -0,0 +1,11 @@
import soundfile
from funasr_local_onnx import Fsmn_vad


model_dir = "/mnt/ailsa.zly/tfbase/espnet_work/FunASR_dev_zly/export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
wav_path = "/mnt/ailsa.zly/tfbase/espnet_work/FunASR_dev_zly/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/vad_example_16k.wav"
model = Fsmn_vad(model_dir)

#offline vad
result = model(wav_path)
print(result)
28
funasr_local/runtime/python/onnxruntime/demo_vad_online.py
Normal file
@@ -0,0 +1,28 @@
import soundfile
from funasr_local_onnx import Fsmn_vad_online


model_dir = "/mnt/ailsa.zly/tfbase/espnet_work/FunASR_dev_zly/export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
wav_path = "/mnt/ailsa.zly/tfbase/espnet_work/FunASR_dev_zly/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/vad_example_16k.wav"
model = Fsmn_vad_online(model_dir)


##online vad
speech, sample_rate = soundfile.read(wav_path)
speech_length = speech.shape[0]
#
sample_offset = 0
step = 1600  # 1600 samples = 100 ms per chunk at 16 kHz
param_dict = {'in_cache': []}
for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
    if sample_offset + step >= speech_length - 1:
        step = speech_length - sample_offset
        is_final = True
    else:
        is_final = False
    param_dict['is_final'] = is_final
    segments_result = model(audio_in=speech[sample_offset: sample_offset + step],
                            param_dict=param_dict)
    if segments_result:
        print(segments_result)
@@ -0,0 +1,6 @@
# -*- encoding: utf-8 -*-
from .paraformer_bin import Paraformer
from .vad_bin import Fsmn_vad
from .vad_bin import Fsmn_vad_online
from .punc_bin import CT_Transformer
from .punc_bin import CT_Transformer_VadRealtime
@@ -0,0 +1,196 @@
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

import os.path
from pathlib import Path
from typing import List, Union, Tuple

import copy
import librosa
import numpy as np

from .utils.utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
                          OrtInferSession, TokenIDConverter, get_logger,
                          read_yaml)
from .utils.postprocess_utils import sentence_postprocess
from .utils.frontend import WavFrontend
from .utils.timestamp_utils import time_stamp_lfr6_onnx

logging = get_logger()


class Paraformer():
    """
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
    https://arxiv.org/abs/2206.08317
    """
    def __init__(self, model_dir: Union[str, Path] = None,
                 batch_size: int = 1,
                 device_id: Union[str, int] = "-1",
                 plot_timestamp_to: str = "",
                 quantize: bool = False,
                 intra_op_num_threads: int = 4,
                 ):

        if not Path(model_dir).exists():
            raise FileNotFoundError(f'{model_dir} does not exist.')

        model_file = os.path.join(model_dir, 'model.onnx')
        if quantize:
            model_file = os.path.join(model_dir, 'model_quant.onnx')
        config_file = os.path.join(model_dir, 'config.yaml')
        cmvn_file = os.path.join(model_dir, 'am.mvn')
        config = read_yaml(config_file)

        self.converter = TokenIDConverter(config['token_list'])
        self.tokenizer = CharTokenizer()
        self.frontend = WavFrontend(
            cmvn_file=cmvn_file,
            **config['frontend_conf']
        )
        self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
        self.batch_size = batch_size
        self.plot_timestamp_to = plot_timestamp_to
        if "predictor_bias" in config['model_conf'].keys():
            self.pred_bias = config['model_conf']['predictor_bias']
        else:
            self.pred_bias = 0

    def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs) -> List:
        waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
        waveform_nums = len(waveform_list)
        asr_res = []
        for beg_idx in range(0, waveform_nums, self.batch_size):

            end_idx = min(waveform_nums, beg_idx + self.batch_size)
            feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
            try:
                outputs = self.infer(feats, feats_len)
                am_scores, valid_token_lens = outputs[0], outputs[1]
                if len(outputs) == 4:
                    # for BiCifParaformer Inference
                    us_alphas, us_peaks = outputs[2], outputs[3]
                else:
                    us_alphas, us_peaks = None, None
            except ONNXRuntimeError:
                # logging.warning(traceback.format_exc())
                logging.warning("input wav is silence or noise")
                preds = ['']
            else:
                preds = self.decode(am_scores, valid_token_lens)
                if us_peaks is None:
                    for pred in preds:
                        pred = sentence_postprocess(pred)
                        asr_res.append({'preds': pred})
                else:
                    for pred, us_peaks_ in zip(preds, us_peaks):
                        raw_tokens = pred
                        timestamp, timestamp_raw = time_stamp_lfr6_onnx(us_peaks_, copy.copy(raw_tokens))
                        text_proc, timestamp_proc, _ = sentence_postprocess(raw_tokens, timestamp_raw)
                        # logging.warning(timestamp)
                        if len(self.plot_timestamp_to):
                            self.plot_wave_timestamp(waveform_list[0], timestamp, self.plot_timestamp_to)
                        asr_res.append({'preds': text_proc, 'timestamp': timestamp_proc, "raw_tokens": raw_tokens})
        return asr_res

    def plot_wave_timestamp(self, wav, text_timestamp, dest):
        # TODO: Plot the wav and timestamp results with matplotlib
        import matplotlib
        matplotlib.use('Agg')
        matplotlib.rc("font", family='Alibaba PuHuiTi')  # set it to a font that your system supports
        import matplotlib.pyplot as plt
        fig, ax1 = plt.subplots(figsize=(11, 3.5), dpi=320)
        ax2 = ax1.twinx()
        ax2.set_ylim([0, 2.0])
        # plot waveform
        ax1.set_ylim([-0.3, 0.3])
        time = np.arange(wav.shape[0]) / 16000
        ax1.plot(time, wav/wav.max()*0.3, color='gray', alpha=0.4)
        # plot lines and text
        for (char, start, end) in text_timestamp:
            ax1.vlines(start, -0.3, 0.3, ls='--')
            ax1.vlines(end, -0.3, 0.3, ls='--')
            x_adj = 0.045 if char != '<sil>' else 0.12
            ax1.text((start + end) * 0.5 - x_adj, 0, char)
        # plt.legend()
        plotname = "{}/timestamp.png".format(dest)
        plt.savefig(plotname, bbox_inches='tight')

    def load_data(self,
                  wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
        def load_wav(path: str) -> np.ndarray:
            waveform, _ = librosa.load(path, sr=fs)
            return waveform

        if isinstance(wav_content, np.ndarray):
            return [wav_content]

        if isinstance(wav_content, str):
            return [load_wav(wav_content)]

        if isinstance(wav_content, list):
            return [load_wav(path) for path in wav_content]

        raise TypeError(
            f'The type of {wav_content} is not in [str, np.ndarray, list]')

    def extract_feat(self,
                     waveform_list: List[np.ndarray]
                     ) -> Tuple[np.ndarray, np.ndarray]:
        feats, feats_len = [], []
        for waveform in waveform_list:
            speech, _ = self.frontend.fbank(waveform)
            feat, feat_len = self.frontend.lfr_cmvn(speech)
            feats.append(feat)
            feats_len.append(feat_len)

        feats = self.pad_feats(feats, np.max(feats_len))
        feats_len = np.array(feats_len).astype(np.int32)
        return feats, feats_len

    @staticmethod
    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
            pad_width = ((0, max_feat_len - cur_len), (0, 0))
            return np.pad(feat, pad_width, 'constant', constant_values=0)

        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
        feats = np.array(feat_res).astype(np.float32)
        return feats

    def infer(self, feats: np.ndarray,
              feats_len: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        outputs = self.ort_infer([feats, feats_len])
        return outputs

    def decode(self, am_scores: np.ndarray, token_nums: int) -> List[str]:
        return [self.decode_one(am_score, token_num)
                for am_score, token_num in zip(am_scores, token_nums)]

    def decode_one(self,
                   am_score: np.ndarray,
                   valid_token_num: int) -> List[str]:
        yseq = am_score.argmax(axis=-1)
        score = am_score.max(axis=-1)
        score = np.sum(score, axis=-1)

        # pad with mask tokens to ensure compatibility with sos/eos tokens
        # asr_model.sos:1  asr_model.eos:2
        yseq = np.array([1] + yseq.tolist() + [2])
        hyp = Hypothesis(yseq=yseq, score=score)

        # remove sos/eos and get results
        last_pos = -1
        token_int = hyp.yseq[1:last_pos].tolist()

        # remove blank symbol id, which is assumed to be 0
        token_int = list(filter(lambda x: x not in (0, 2), token_int))

        # Change integer-ids to tokens
        token = self.converter.ids2tokens(token_int)
        token = token[:valid_token_num - self.pred_bias]
        # texts = sentence_postprocess(token)
        return token
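`decode_one` above is a plain greedy decode: argmax over the vocabulary per frame, then strip the blank (id 0) and eos (id 2) symbols. A standalone sketch of that core step, with a toy score matrix and hypothetical token ids:

```python
import numpy as np

am_score = np.array([[0.1, 0.8, 0.1],   # frame 1 -> token 1
                     [0.7, 0.2, 0.1],   # frame 2 -> blank (0), dropped
                     [0.1, 0.1, 0.8]])  # frame 3 -> eos (2), dropped
yseq = am_score.argmax(axis=-1)
token_int = [x for x in yseq.tolist() if x not in (0, 2)]
print(token_int)  # [1]
```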
261
funasr_local/runtime/python/onnxruntime/funasr_onnx/punc_bin.py
Normal file
@@ -0,0 +1,261 @@
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

import os.path
from pathlib import Path
from typing import List, Union, Tuple
import numpy as np

from .utils.utils import (ONNXRuntimeError,
                          OrtInferSession, get_logger,
                          read_yaml)
from .utils.utils import (TokenIDConverter, split_to_mini_sentence, code_mix_split_words)

logging = get_logger()


class CT_Transformer():
    """
    Author: Speech Lab of DAMO Academy, Alibaba Group
    CT-Transformer: Controllable time-delay transformer for real-time punctuation prediction and disfluency detection
    https://arxiv.org/pdf/2003.01309.pdf
    """
    def __init__(self, model_dir: Union[str, Path] = None,
                 batch_size: int = 1,
                 device_id: Union[str, int] = "-1",
                 quantize: bool = False,
                 intra_op_num_threads: int = 4
                 ):

        if not Path(model_dir).exists():
            raise FileNotFoundError(f'{model_dir} does not exist.')

        model_file = os.path.join(model_dir, 'model.onnx')
        if quantize:
            model_file = os.path.join(model_dir, 'model_quant.onnx')
        config_file = os.path.join(model_dir, 'punc.yaml')
        config = read_yaml(config_file)

        self.converter = TokenIDConverter(config['token_list'])
        self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
        self.batch_size = 1
        self.punc_list = config['punc_list']
        self.period = 0
        for i in range(len(self.punc_list)):
            if self.punc_list[i] == ",":
                self.punc_list[i] = ","
            elif self.punc_list[i] == "?":
                self.punc_list[i] = "?"
            elif self.punc_list[i] == "。":
                self.period = i

    def __call__(self, text: Union[list, str], split_size=20):
        split_text = code_mix_split_words(text)
        split_text_id = self.converter.tokens2ids(split_text)
        mini_sentences = split_to_mini_sentence(split_text, split_size)
        mini_sentences_id = split_to_mini_sentence(split_text_id, split_size)
        assert len(mini_sentences) == len(mini_sentences_id)
        cache_sent = []
        cache_sent_id = []
        new_mini_sentence = ""
        new_mini_sentence_punc = []
        cache_pop_trigger_limit = 200
        for mini_sentence_i in range(len(mini_sentences)):
            mini_sentence = mini_sentences[mini_sentence_i]
            mini_sentence_id = mini_sentences_id[mini_sentence_i]
            mini_sentence = cache_sent + mini_sentence
            mini_sentence_id = np.array(cache_sent_id + mini_sentence_id, dtype='int64')
            data = {
                "text": mini_sentence_id[None, :],
                "text_lengths": np.array([len(mini_sentence_id)], dtype='int32'),
            }
            try:
                outputs = self.infer(data['text'], data['text_lengths'])
                y = outputs[0]
                punctuations = np.argmax(y, axis=-1)[0]
                assert punctuations.size == len(mini_sentence)
            except ONNXRuntimeError:
                logging.warning("error")

            # Search for the last Period/QuestionMark as cache
            if mini_sentence_i < len(mini_sentences) - 1:
                sentenceEnd = -1
                last_comma_index = -1
                for i in range(len(punctuations) - 2, 1, -1):
                    if self.punc_list[punctuations[i]] == "。" or self.punc_list[punctuations[i]] == "?":
                        sentenceEnd = i
                        break
                    if last_comma_index < 0 and self.punc_list[punctuations[i]] == ",":
                        last_comma_index = i

                if sentenceEnd < 0 and len(mini_sentence) > cache_pop_trigger_limit and last_comma_index >= 0:
                    # The sentence is too long, cut it off at a comma.
                    sentenceEnd = last_comma_index
                    punctuations[sentenceEnd] = self.period
                cache_sent = mini_sentence[sentenceEnd + 1:]
                cache_sent_id = mini_sentence_id[sentenceEnd + 1:].tolist()
                mini_sentence = mini_sentence[0:sentenceEnd + 1]
                punctuations = punctuations[0:sentenceEnd + 1]

            new_mini_sentence_punc += [int(x) for x in punctuations]
            words_with_punc = []
            for i in range(len(mini_sentence)):
                if i > 0:
                    if len(mini_sentence[i][0].encode()) == 1 and len(mini_sentence[i - 1][0].encode()) == 1:
                        mini_sentence[i] = " " + mini_sentence[i]
                words_with_punc.append(mini_sentence[i])
                if self.punc_list[punctuations[i]] != "_":
                    words_with_punc.append(self.punc_list[punctuations[i]])
            new_mini_sentence += "".join(words_with_punc)
            # Add Period for the end of the sentence
            new_mini_sentence_out = new_mini_sentence
            new_mini_sentence_punc_out = new_mini_sentence_punc
            if mini_sentence_i == len(mini_sentences) - 1:
                if new_mini_sentence[-1] == "," or new_mini_sentence[-1] == "、":
                    new_mini_sentence_out = new_mini_sentence[:-1] + "。"
                    new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.period]
                elif new_mini_sentence[-1] != "。" and new_mini_sentence[-1] != "?":
                    new_mini_sentence_out = new_mini_sentence + "。"
                    new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.period]
        return new_mini_sentence_out, new_mini_sentence_punc_out

    def infer(self, feats: np.ndarray,
              feats_len: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        outputs = self.ort_infer([feats, feats_len])
        return outputs


class CT_Transformer_VadRealtime(CT_Transformer):
    """
    Author: Speech Lab of DAMO Academy, Alibaba Group
    CT-Transformer: Controllable time-delay transformer for real-time punctuation prediction and disfluency detection
    https://arxiv.org/pdf/2003.01309.pdf
    """
    def __init__(self, model_dir: Union[str, Path] = None,
                 batch_size: int = 1,
                 device_id: Union[str, int] = "-1",
                 quantize: bool = False,
                 intra_op_num_threads: int = 4
                 ):
        super(CT_Transformer_VadRealtime, self).__init__(model_dir, batch_size, device_id, quantize, intra_op_num_threads)

    def __call__(self, text: str, param_dict: map, split_size=20):
        cache_key = "cache"
        assert cache_key in param_dict
        cache = param_dict[cache_key]
        if cache is not None and len(cache) > 0:
            precache = "".join(cache)
        else:
            precache = ""
            cache = []
        full_text = precache + text
        split_text = code_mix_split_words(full_text)
        split_text_id = self.converter.tokens2ids(split_text)
        mini_sentences = split_to_mini_sentence(split_text, split_size)
        mini_sentences_id = split_to_mini_sentence(split_text_id, split_size)
        new_mini_sentence_punc = []
        assert len(mini_sentences) == len(mini_sentences_id)

        cache_sent = []
        cache_sent_id = np.array([], dtype='int32')
        sentence_punc_list = []
        sentence_words_list = []
        cache_pop_trigger_limit = 200
        skip_num = 0
        for mini_sentence_i in range(len(mini_sentences)):
            mini_sentence = mini_sentences[mini_sentence_i]
            mini_sentence_id = mini_sentences_id[mini_sentence_i]
            mini_sentence = cache_sent + mini_sentence
            mini_sentence_id = np.concatenate((cache_sent_id, mini_sentence_id), axis=0)
            text_length = len(mini_sentence_id)
            data = {
                "input": mini_sentence_id[None, :],
                "text_lengths": np.array([text_length], dtype='int32'),
                "vad_mask": self.vad_mask(text_length, len(cache))[None, None, :, :].astype(np.float32),
                "sub_masks": np.tril(np.ones((text_length, text_length), dtype=np.float32))[None, None, :, :].astype(np.float32)
            }
            try:
                outputs = self.infer(data['input'], data['text_lengths'], data['vad_mask'], data["sub_masks"])
                y = outputs[0]
                punctuations = np.argmax(y, axis=-1)[0]
                assert punctuations.size == len(mini_sentence)
            except ONNXRuntimeError:
                logging.warning("error")

            # Search for the last Period/QuestionMark as cache
            if mini_sentence_i < len(mini_sentences) - 1:
                sentenceEnd = -1
                last_comma_index = -1
                for i in range(len(punctuations) - 2, 1, -1):
                    if self.punc_list[punctuations[i]] == "。" or self.punc_list[punctuations[i]] == "?":
                        sentenceEnd = i
                        break
                    if last_comma_index < 0 and self.punc_list[punctuations[i]] == ",":
                        last_comma_index = i

                if sentenceEnd < 0 and len(mini_sentence) > cache_pop_trigger_limit and last_comma_index >= 0:
                    # The sentence is too long, cut it off at a comma.
                    sentenceEnd = last_comma_index
                    punctuations[sentenceEnd] = self.period
                cache_sent = mini_sentence[sentenceEnd + 1:]
                cache_sent_id = mini_sentence_id[sentenceEnd + 1:]
                mini_sentence = mini_sentence[0:sentenceEnd + 1]
                punctuations = punctuations[0:sentenceEnd + 1]

            punctuations_np = [int(x) for x in punctuations]
            new_mini_sentence_punc += punctuations_np
            sentence_punc_list += [self.punc_list[int(x)] for x in punctuations_np]
            sentence_words_list += mini_sentence

        assert len(sentence_punc_list) == len(sentence_words_list)
        words_with_punc = []
        sentence_punc_list_out = []
        for i in range(0, len(sentence_words_list)):
            if i > 0:
                if len(sentence_words_list[i][0].encode()) == 1 and len(sentence_words_list[i - 1][-1].encode()) == 1:
                    sentence_words_list[i] = " " + sentence_words_list[i]
            if skip_num < len(cache):
                skip_num += 1
            else:
                words_with_punc.append(sentence_words_list[i])
            if skip_num >= len(cache):
                sentence_punc_list_out.append(sentence_punc_list[i])
                if sentence_punc_list[i] != "_":
                    words_with_punc.append(sentence_punc_list[i])
        sentence_out = "".join(words_with_punc)

        sentenceEnd = -1
        for i in range(len(sentence_punc_list) - 2, 1, -1):
            if sentence_punc_list[i] == "。" or sentence_punc_list[i] == "?":
                sentenceEnd = i
                break
        cache_out = sentence_words_list[sentenceEnd + 1:]
        if sentence_out[-1] in self.punc_list:
            sentence_out = sentence_out[:-1]
            sentence_punc_list_out[-1] = "_"
        param_dict[cache_key] = cache_out
        return sentence_out, sentence_punc_list_out, cache_out

    def vad_mask(self, size, vad_pos, dtype=bool):
        """Create mask for decoder self-attention.

        :param int size: size of mask
        :param int vad_pos: index of the vad position
        :param dtype: result dtype (plain bool; np.bool was removed in modern NumPy)
        :rtype: np.ndarray (B, Lmax, Lmax)
        """
        ret = np.ones((size, size), dtype=dtype)
        if vad_pos <= 0 or vad_pos >= size:
            return ret
        sub_corner = np.zeros(
            (vad_pos - 1, size - vad_pos), dtype=dtype)
        ret[0:vad_pos - 1, vad_pos:] = sub_corner
        return ret

    def infer(self, feats: np.ndarray,
              feats_len: np.ndarray,
              vad_mask: np.ndarray,
              sub_masks: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        outputs = self.ort_infer([feats, feats_len, vad_mask, sub_masks])
        return outputs
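The mask built by `vad_mask` keeps full attention for the rows at and after the vad position, but blocks the pre-cache rows from seeing the new text. A quick look at the pattern it produces, computed the same way as the method above:

```python
import numpy as np

size, vad_pos = 5, 3
ret = np.ones((size, size), dtype=bool)
ret[0:vad_pos - 1, vad_pos:] = False  # rows before the vad position cannot attend past it
print(ret.astype(int))
# [[1 1 1 0 0]
#  [1 1 1 0 0]
#  [1 1 1 1 1]
#  [1 1 1 1 1]
#  [1 1 1 1 1]]
```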
@@ -0,0 +1,624 @@
|
||||
# -*- encoding: utf-8 -*-
|
||||
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
|
||||
# MIT License (https://opensource.org/licenses/MIT)
|
||||
|
||||
from enum import Enum
|
||||
from typing import List, Tuple, Dict, Any
|
||||
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
class VadStateMachine(Enum):
|
||||
kVadInStateStartPointNotDetected = 1
|
||||
kVadInStateInSpeechSegment = 2
|
||||
kVadInStateEndPointDetected = 3
|
||||
|
||||
|
||||
class FrameState(Enum):
|
||||
kFrameStateInvalid = -1
|
||||
kFrameStateSpeech = 1
|
||||
kFrameStateSil = 0
|
||||
|
||||
|
||||
# final voice/unvoice state per frame
|
||||
class AudioChangeState(Enum):
|
||||
kChangeStateSpeech2Speech = 0
|
||||
kChangeStateSpeech2Sil = 1
|
||||
kChangeStateSil2Sil = 2
|
||||
kChangeStateSil2Speech = 3
|
||||
kChangeStateNoBegin = 4
|
||||
kChangeStateInvalid = 5
|
||||
|
||||
|
||||
class VadDetectMode(Enum):
|
||||
kVadSingleUtteranceDetectMode = 0
|
||||
kVadMutipleUtteranceDetectMode = 1
|
||||
|
||||
|
||||
class VADXOptions:
    def __init__(
            self,
            sample_rate: int = 16000,
            detect_mode: int = VadDetectMode.kVadMutipleUtteranceDetectMode.value,
            snr_mode: int = 0,
            max_end_silence_time: int = 800,
            max_start_silence_time: int = 3000,
            do_start_point_detection: bool = True,
            do_end_point_detection: bool = True,
            window_size_ms: int = 200,
            sil_to_speech_time_thres: int = 150,
            speech_to_sil_time_thres: int = 150,
            speech_2_noise_ratio: float = 1.0,
            do_extend: int = 1,
            lookback_time_start_point: int = 200,
            lookahead_time_end_point: int = 100,
            max_single_segment_time: int = 60000,
            nn_eval_block_size: int = 8,
            dcd_block_size: int = 4,
            snr_thres: float = -100.0,
            noise_frame_num_used_for_snr: int = 100,
            decibel_thres: float = -100.0,
            speech_noise_thres: float = 0.6,
            fe_prior_thres: float = 1e-4,
            silence_pdf_num: int = 1,
            sil_pdf_ids: List[int] = [0],
            speech_noise_thresh_low: float = -0.1,
            speech_noise_thresh_high: float = 0.3,
            output_frame_probs: bool = False,
            frame_in_ms: int = 10,
            frame_length_ms: int = 25,
    ):
        self.sample_rate = sample_rate
        self.detect_mode = detect_mode
        self.snr_mode = snr_mode
        self.max_end_silence_time = max_end_silence_time
        self.max_start_silence_time = max_start_silence_time
        self.do_start_point_detection = do_start_point_detection
        self.do_end_point_detection = do_end_point_detection
        self.window_size_ms = window_size_ms
        self.sil_to_speech_time_thres = sil_to_speech_time_thres
        self.speech_to_sil_time_thres = speech_to_sil_time_thres
        self.speech_2_noise_ratio = speech_2_noise_ratio
        self.do_extend = do_extend
        self.lookback_time_start_point = lookback_time_start_point
        self.lookahead_time_end_point = lookahead_time_end_point
        self.max_single_segment_time = max_single_segment_time
        self.nn_eval_block_size = nn_eval_block_size
        self.dcd_block_size = dcd_block_size
        self.snr_thres = snr_thres
        self.noise_frame_num_used_for_snr = noise_frame_num_used_for_snr
        self.decibel_thres = decibel_thres
        self.speech_noise_thres = speech_noise_thres
        self.fe_prior_thres = fe_prior_thres
        self.silence_pdf_num = silence_pdf_num
        self.sil_pdf_ids = sil_pdf_ids
        self.speech_noise_thresh_low = speech_noise_thresh_low
        self.speech_noise_thresh_high = speech_noise_thresh_high
        self.output_frame_probs = output_frame_probs
        self.frame_in_ms = frame_in_ms
        self.frame_length_ms = frame_length_ms

class E2EVadSpeechBufWithDoa(object):
    def __init__(self):
        self.start_ms = 0
        self.end_ms = 0
        self.buffer = []
        self.contain_seg_start_point = False
        self.contain_seg_end_point = False
        self.doa = 0

    def Reset(self):
        self.start_ms = 0
        self.end_ms = 0
        self.buffer = []
        self.contain_seg_start_point = False
        self.contain_seg_end_point = False
        self.doa = 0


class E2EVadFrameProb(object):
    def __init__(self):
        self.noise_prob = 0.0
        self.speech_prob = 0.0
        self.score = 0.0
        self.frame_id = 0
        self.frm_state = 0

class WindowDetector(object):
    def __init__(self, window_size_ms: int, sil_to_speech_time: int,
                 speech_to_sil_time: int, frame_size_ms: int):
        self.window_size_ms = window_size_ms
        self.sil_to_speech_time = sil_to_speech_time
        self.speech_to_sil_time = speech_to_sil_time
        self.frame_size_ms = frame_size_ms

        self.win_size_frame = int(window_size_ms / frame_size_ms)
        self.win_sum = 0
        self.win_state = [0] * self.win_size_frame  # initialize the sliding window

        self.cur_win_pos = 0
        self.pre_frame_state = FrameState.kFrameStateSil
        self.cur_frame_state = FrameState.kFrameStateSil
        self.sil_to_speech_frmcnt_thres = int(sil_to_speech_time / frame_size_ms)
        self.speech_to_sil_frmcnt_thres = int(speech_to_sil_time / frame_size_ms)

        self.voice_last_frame_count = 0
        self.noise_last_frame_count = 0
        self.hydre_frame_count = 0

    def Reset(self) -> None:
        self.cur_win_pos = 0
        self.win_sum = 0
        self.win_state = [0] * self.win_size_frame
        self.pre_frame_state = FrameState.kFrameStateSil
        self.cur_frame_state = FrameState.kFrameStateSil
        self.voice_last_frame_count = 0
        self.noise_last_frame_count = 0
        self.hydre_frame_count = 0

    def GetWinSize(self) -> int:
        return int(self.win_size_frame)

    def DetectOneFrame(self, frameState: FrameState, frame_count: int) -> AudioChangeState:
        cur_frame_state = 0  # 1 for speech, 0 for silence in the ring buffer
        if frameState == FrameState.kFrameStateSpeech:
            cur_frame_state = 1
        elif frameState == FrameState.kFrameStateSil:
            cur_frame_state = 0
        else:
            return AudioChangeState.kChangeStateInvalid
        # update the ring buffer and the running count of speech frames
        self.win_sum -= self.win_state[self.cur_win_pos]
        self.win_sum += cur_frame_state
        self.win_state[self.cur_win_pos] = cur_frame_state
        self.cur_win_pos = (self.cur_win_pos + 1) % self.win_size_frame

        if self.pre_frame_state == FrameState.kFrameStateSil and self.win_sum >= self.sil_to_speech_frmcnt_thres:
            self.pre_frame_state = FrameState.kFrameStateSpeech
            return AudioChangeState.kChangeStateSil2Speech

        if self.pre_frame_state == FrameState.kFrameStateSpeech and self.win_sum <= self.speech_to_sil_frmcnt_thres:
            self.pre_frame_state = FrameState.kFrameStateSil
            return AudioChangeState.kChangeStateSpeech2Sil

        if self.pre_frame_state == FrameState.kFrameStateSil:
            return AudioChangeState.kChangeStateSil2Sil
        if self.pre_frame_state == FrameState.kFrameStateSpeech:
            return AudioChangeState.kChangeStateSpeech2Speech
        return AudioChangeState.kChangeStateInvalid

    def FrameSizeMs(self) -> int:
        return int(self.frame_size_ms)

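# Illustrative sketch (an added example, not part of the original file): with
# the default 10 ms frames, feeding the detector a run of speech frames flips
# its smoothed state once the window accumulates sil_to_speech_frmcnt_thres
# speech frames.
#
#   wd = WindowDetector(window_size_ms=200, sil_to_speech_time=150,
#                       speech_to_sil_time=150, frame_size_ms=10)
#   for t in range(20):
#       state = wd.DetectOneFrame(FrameState.kFrameStateSpeech, t)
#   # state becomes kChangeStateSil2Speech at frame 14 (15 speech frames seen),
#   # then kChangeStateSpeech2Speech afterwards.
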
class E2EVadModel():
    """
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Deep-FSMN for Large Vocabulary Continuous Speech Recognition
    https://arxiv.org/abs/1803.05030
    """

    def __init__(self, vad_post_args: Dict[str, Any]):
        super(E2EVadModel, self).__init__()
        self.vad_opts = VADXOptions(**vad_post_args)
        self.windows_detector = WindowDetector(self.vad_opts.window_size_ms,
                                               self.vad_opts.sil_to_speech_time_thres,
                                               self.vad_opts.speech_to_sil_time_thres,
                                               self.vad_opts.frame_in_ms)
        # initialize all per-utterance state; identical to a full reset
        self.AllResetDetection()

    def AllResetDetection(self):
        self.is_final = False
        self.data_buf_start_frame = 0
        self.frm_cnt = 0
        self.latest_confirmed_speech_frame = 0
        self.lastest_confirmed_silence_frame = -1
        self.continous_silence_frame_count = 0
        self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
        self.confirmed_start_frame = -1
        self.confirmed_end_frame = -1
        self.number_end_time_detected = 0
        self.sil_frame = 0
        self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
        self.noise_average_decibel = -100.0
        self.pre_end_silence_detected = False
        self.next_seg = True

        self.output_data_buf = []
        self.output_data_buf_offset = 0
        self.frame_probs = []
        self.max_end_sil_frame_cnt_thresh = self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
        self.speech_noise_thres = self.vad_opts.speech_noise_thres
        self.scores = None
        self.idx_pre_chunk = 0
        self.max_time_out = False
        self.decibel = []
        self.data_buf_size = 0
        self.data_buf_all_size = 0
        self.waveform = None
        self.ResetDetection()

    def ResetDetection(self):
        self.continous_silence_frame_count = 0
        self.latest_confirmed_speech_frame = 0
        self.lastest_confirmed_silence_frame = -1
        self.confirmed_start_frame = -1
        self.confirmed_end_frame = -1
        self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
        self.windows_detector.Reset()
        self.sil_frame = 0
        self.frame_probs = []

    def ComputeDecibel(self) -> None:
        frame_sample_length = int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000)
        frame_shift_length = int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
        if self.data_buf_all_size == 0:
            self.data_buf_all_size = len(self.waveform[0])
            self.data_buf_size = self.data_buf_all_size
        else:
            self.data_buf_all_size += len(self.waveform[0])
        for offset in range(0, self.waveform.shape[1] - frame_sample_length + 1, frame_shift_length):
            self.decibel.append(
                10 * math.log10(np.square(self.waveform[0][offset: offset + frame_sample_length]).sum() + 1e-6))

    def ComputeScores(self, scores: np.ndarray) -> None:
        # scores come from the FSMN encoder, shaped B * T * D
        self.vad_opts.nn_eval_block_size = scores.shape[1]
        self.frm_cnt += scores.shape[1]  # count total frames
        self.scores = scores

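    # Worked numbers (an added note, assuming the default 16 kHz options):
    # a 25 ms window is 16000 * 25 / 1000 = 400 samples and a 10 ms shift is
    # 160 samples, so one second of audio yields
    # (16000 - 400) / 160 + 1 = 98 decibel values.
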
    def PopDataBufTillFrame(self, frame_idx: int) -> None:  # need check again
        while self.data_buf_start_frame < frame_idx:
            if self.data_buf_size >= int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):
                self.data_buf_start_frame += 1
                self.data_buf_size = self.data_buf_all_size - self.data_buf_start_frame * int(
                    self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)

    def PopDataToOutputBuf(self, start_frm: int, frm_cnt: int, first_frm_is_start_point: bool,
                           last_frm_is_end_point: bool, end_point_is_sent_end: bool) -> None:
        self.PopDataBufTillFrame(start_frm)
        expected_sample_number = int(frm_cnt * self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000)
        if last_frm_is_end_point:
            extra_sample = max(0, int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000 -
                                      self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000))
            expected_sample_number += int(extra_sample)
        if end_point_is_sent_end:
            expected_sample_number = max(expected_sample_number, self.data_buf_size)
        if self.data_buf_size < expected_sample_number:
            print('error in calling pop data_buf\n')

        if len(self.output_data_buf) == 0 or first_frm_is_start_point:
            self.output_data_buf.append(E2EVadSpeechBufWithDoa())
            self.output_data_buf[-1].Reset()
            self.output_data_buf[-1].start_ms = start_frm * self.vad_opts.frame_in_ms
            self.output_data_buf[-1].end_ms = self.output_data_buf[-1].start_ms
            self.output_data_buf[-1].doa = 0
        cur_seg = self.output_data_buf[-1]
        if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
            print('warning\n')
        out_pos = len(cur_seg.buffer)  # cur_seg.buffer is not filled in this Python port
        data_to_pop = 0
        if end_point_is_sent_end:
            data_to_pop = expected_sample_number
        else:
            data_to_pop = int(frm_cnt * self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
        if data_to_pop > self.data_buf_size:
            print('VAD data_to_pop is bigger than self.data_buf_size!!!\n')
            data_to_pop = self.data_buf_size
            expected_sample_number = self.data_buf_size

        cur_seg.doa = 0
        # the two loops below mirror the C++ implementation, which copies raw
        # samples into cur_seg.buffer; here only the out_pos bookkeeping is kept
        for sample_cpy_out in range(0, data_to_pop):
            # cur_seg.buffer[out_pos ++] = data_buf_.back();
            out_pos += 1
        for sample_cpy_out in range(data_to_pop, expected_sample_number):
            # cur_seg.buffer[out_pos++] = data_buf_.back()
            out_pos += 1
        if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
            print('Something wrong with the VAD algorithm\n')
        self.data_buf_start_frame += frm_cnt
        cur_seg.end_ms = (start_frm + frm_cnt) * self.vad_opts.frame_in_ms
        if first_frm_is_start_point:
            cur_seg.contain_seg_start_point = True
        if last_frm_is_end_point:
            cur_seg.contain_seg_end_point = True

    def OnSilenceDetected(self, valid_frame: int):
        self.lastest_confirmed_silence_frame = valid_frame
        if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
            self.PopDataBufTillFrame(valid_frame)
        # silence_detected_callback_
        # pass

    def OnVoiceDetected(self, valid_frame: int) -> None:
        self.latest_confirmed_speech_frame = valid_frame
        self.PopDataToOutputBuf(valid_frame, 1, False, False, False)

    def OnVoiceStart(self, start_frame: int, fake_result: bool = False) -> None:
        if self.vad_opts.do_start_point_detection:
            pass
        if self.confirmed_start_frame != -1:
            print('not reset vad properly\n')
        else:
            self.confirmed_start_frame = start_frame

        if not fake_result and self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
            self.PopDataToOutputBuf(self.confirmed_start_frame, 1, True, False, False)

    def OnVoiceEnd(self, end_frame: int, fake_result: bool, is_last_frame: bool) -> None:
        for t in range(self.latest_confirmed_speech_frame + 1, end_frame):
            self.OnVoiceDetected(t)
        if self.vad_opts.do_end_point_detection:
            pass
        if self.confirmed_end_frame != -1:
            print('not reset vad properly\n')
        else:
            self.confirmed_end_frame = end_frame
        if not fake_result:
            self.sil_frame = 0
            self.PopDataToOutputBuf(self.confirmed_end_frame, 1, False, True, is_last_frame)
        self.number_end_time_detected += 1

    def MaybeOnVoiceEndIfLastFrame(self, is_final_frame: bool, cur_frm_idx: int) -> None:
        if is_final_frame:
            self.OnVoiceEnd(cur_frm_idx, False, True)
            self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected

    def GetLatency(self) -> int:
        return int(self.LatencyFrmNumAtStartPoint() * self.vad_opts.frame_in_ms)

    def LatencyFrmNumAtStartPoint(self) -> int:
        vad_latency = self.windows_detector.GetWinSize()
        if self.vad_opts.do_extend:
            vad_latency += int(self.vad_opts.lookback_time_start_point / self.vad_opts.frame_in_ms)
        return vad_latency

    def GetFrameState(self, t: int) -> FrameState:
        frame_state = FrameState.kFrameStateInvalid
        cur_decibel = self.decibel[t]
        cur_snr = cur_decibel - self.noise_average_decibel
        # for each frame, calc the log posterior probability of each state
        if cur_decibel < self.vad_opts.decibel_thres:
            frame_state = FrameState.kFrameStateSil
            self.DetectOneFrame(frame_state, t, False)
            return frame_state

        sum_score = 0.0
        noise_prob = 0.0
        assert len(self.sil_pdf_ids) == self.vad_opts.silence_pdf_num
        if len(self.sil_pdf_ids) > 0:
            assert len(self.scores) == 1  # only batch_size = 1 is supported
            sil_pdf_scores = [self.scores[0][t - self.idx_pre_chunk][sil_pdf_id] for sil_pdf_id in self.sil_pdf_ids]
            sum_score = sum(sil_pdf_scores)
            noise_prob = math.log(sum_score) * self.vad_opts.speech_2_noise_ratio
            total_score = 1.0
            sum_score = total_score - sum_score
        speech_prob = math.log(sum_score)
        if self.vad_opts.output_frame_probs:
            frame_prob = E2EVadFrameProb()
            frame_prob.noise_prob = noise_prob
            frame_prob.speech_prob = speech_prob
            frame_prob.score = sum_score
            frame_prob.frame_id = t
            self.frame_probs.append(frame_prob)
        if math.exp(speech_prob) >= math.exp(noise_prob) + self.speech_noise_thres:
            if cur_snr >= self.vad_opts.snr_thres and cur_decibel >= self.vad_opts.decibel_thres:
                frame_state = FrameState.kFrameStateSpeech
            else:
                frame_state = FrameState.kFrameStateSil
        else:
            frame_state = FrameState.kFrameStateSil
            if self.noise_average_decibel < -99.9:
                self.noise_average_decibel = cur_decibel
            else:
                # running average over the last noise_frame_num_used_for_snr noise frames
                self.noise_average_decibel = (cur_decibel + self.noise_average_decibel * (
                        self.vad_opts.noise_frame_num_used_for_snr - 1)) / self.vad_opts.noise_frame_num_used_for_snr

        return frame_state

    def __call__(self, score: np.ndarray, waveform: np.ndarray,
                 is_final: bool = False, max_end_sil: int = 800, online: bool = False
                 ):
        self.max_end_sil_frame_cnt_thresh = max_end_sil - self.vad_opts.speech_to_sil_time_thres
        self.waveform = waveform  # compute decibel for each frame
        self.ComputeDecibel()
        self.ComputeScores(score)
        if not is_final:
            self.DetectCommonFrames()
        else:
            self.DetectLastFrames()
        segments = []
        for batch_num in range(0, score.shape[0]):  # only support batch_size = 1 now
            segment_batch = []
            if len(self.output_data_buf) > 0:
                for i in range(self.output_data_buf_offset, len(self.output_data_buf)):
                    if online:
                        if not self.output_data_buf[i].contain_seg_start_point:
                            continue
                        if not self.next_seg and not self.output_data_buf[i].contain_seg_end_point:
                            continue
                        start_ms = self.output_data_buf[i].start_ms if self.next_seg else -1
                        if self.output_data_buf[i].contain_seg_end_point:
                            end_ms = self.output_data_buf[i].end_ms
                            self.next_seg = True
                            self.output_data_buf_offset += 1
                        else:
                            end_ms = -1
                            self.next_seg = False
                    else:
                        if not is_final and (not self.output_data_buf[i].contain_seg_start_point
                                             or not self.output_data_buf[i].contain_seg_end_point):
                            continue
                        start_ms = self.output_data_buf[i].start_ms
                        end_ms = self.output_data_buf[i].end_ms
                        self.output_data_buf_offset += 1
                    segment = [start_ms, end_ms]
                    segment_batch.append(segment)

            if segment_batch:
                segments.append(segment_batch)
        if is_final:
            # reset class variables and clear the dict for the next query
            self.AllResetDetection()
        return segments

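    # Illustrative usage sketch (an added example; the config key name is an
    # assumption): `scores` are frame-level posteriors from the FSMN ONNX
    # model, shaped (1, T, D), and `waveform` is the matching (1, samples)
    # float32 audio.
    #
    #   vad = E2EVadModel(config['vad_post_conf'])
    #   segments = vad(scores, waveform, is_final=True, max_end_sil=800)
    #   # e.g. [[[70, 2340], [2620, 5150]]] -> [start_ms, end_ms] per segment
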
    def DetectCommonFrames(self) -> int:
        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
            return 0
        for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
            frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
            self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
        self.idx_pre_chunk += self.scores.shape[1]
        return 0

    def DetectLastFrames(self) -> int:
        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
            return 0
        for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
            frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
            if i != 0:
                self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
            else:
                # the last frame of the utterance closes any open segment
                self.DetectOneFrame(frame_state, self.frm_cnt - 1, True)

        return 0

    def DetectOneFrame(self, cur_frm_state: FrameState, cur_frm_idx: int, is_final_frame: bool) -> None:
        tmp_cur_frm_state = FrameState.kFrameStateInvalid
        if cur_frm_state == FrameState.kFrameStateSpeech:
            if math.fabs(1.0) > self.vad_opts.fe_prior_thres:
                tmp_cur_frm_state = FrameState.kFrameStateSpeech
            else:
                tmp_cur_frm_state = FrameState.kFrameStateSil
        elif cur_frm_state == FrameState.kFrameStateSil:
            tmp_cur_frm_state = FrameState.kFrameStateSil
        state_change = self.windows_detector.DetectOneFrame(tmp_cur_frm_state, cur_frm_idx)
        frm_shift_in_ms = self.vad_opts.frame_in_ms
        if AudioChangeState.kChangeStateSil2Speech == state_change:
            silence_frame_count = self.continous_silence_frame_count
            self.continous_silence_frame_count = 0
            self.pre_end_silence_detected = False
            start_frame = 0
            if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
                start_frame = max(self.data_buf_start_frame, cur_frm_idx - self.LatencyFrmNumAtStartPoint())
                self.OnVoiceStart(start_frame)
                self.vad_state_machine = VadStateMachine.kVadInStateInSpeechSegment
                for t in range(start_frame + 1, cur_frm_idx + 1):
                    self.OnVoiceDetected(t)
            elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                for t in range(self.latest_confirmed_speech_frame + 1, cur_frm_idx):
                    self.OnVoiceDetected(t)
                if cur_frm_idx - self.confirmed_start_frame + 1 > \
                        self.vad_opts.max_single_segment_time / frm_shift_in_ms:
                    self.OnVoiceEnd(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif not is_final_frame:
                    self.OnVoiceDetected(cur_frm_idx)
                else:
                    self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
            else:
                pass
        elif AudioChangeState.kChangeStateSpeech2Sil == state_change:
            self.continous_silence_frame_count = 0
            if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
                pass
            elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                if cur_frm_idx - self.confirmed_start_frame + 1 > \
                        self.vad_opts.max_single_segment_time / frm_shift_in_ms:
                    self.OnVoiceEnd(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif not is_final_frame:
                    self.OnVoiceDetected(cur_frm_idx)
                else:
                    self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
            else:
                pass
        elif AudioChangeState.kChangeStateSpeech2Speech == state_change:
            self.continous_silence_frame_count = 0
            if self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                if cur_frm_idx - self.confirmed_start_frame + 1 > \
                        self.vad_opts.max_single_segment_time / frm_shift_in_ms:
                    self.max_time_out = True
                    self.OnVoiceEnd(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif not is_final_frame:
                    self.OnVoiceDetected(cur_frm_idx)
                else:
                    self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
            else:
                pass
        elif AudioChangeState.kChangeStateSil2Sil == state_change:
            self.continous_silence_frame_count += 1
            if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
                # silence timeout, return a zero-length decision
                if ((self.vad_opts.detect_mode == VadDetectMode.kVadSingleUtteranceDetectMode.value) and (
                        self.continous_silence_frame_count * frm_shift_in_ms > self.vad_opts.max_start_silence_time)) \
                        or (is_final_frame and self.number_end_time_detected == 0):
                    for t in range(self.lastest_confirmed_silence_frame + 1, cur_frm_idx):
                        self.OnSilenceDetected(t)
                    self.OnVoiceStart(0, True)
                    self.OnVoiceEnd(0, True, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                else:
                    if cur_frm_idx >= self.LatencyFrmNumAtStartPoint():
                        self.OnSilenceDetected(cur_frm_idx - self.LatencyFrmNumAtStartPoint())
            elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                if self.continous_silence_frame_count * frm_shift_in_ms >= self.max_end_sil_frame_cnt_thresh:
                    lookback_frame = int(self.max_end_sil_frame_cnt_thresh / frm_shift_in_ms)
                    if self.vad_opts.do_extend:
                        lookback_frame -= int(self.vad_opts.lookahead_time_end_point / frm_shift_in_ms)
                        lookback_frame -= 1
                        lookback_frame = max(0, lookback_frame)
                    self.OnVoiceEnd(cur_frm_idx - lookback_frame, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif cur_frm_idx - self.confirmed_start_frame + 1 > \
                        self.vad_opts.max_single_segment_time / frm_shift_in_ms:
                    self.OnVoiceEnd(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif self.vad_opts.do_extend and not is_final_frame:
                    if self.continous_silence_frame_count <= int(
                            self.vad_opts.lookahead_time_end_point / frm_shift_in_ms):
                        self.OnVoiceDetected(cur_frm_idx)
                else:
                    self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
            else:
                pass

        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected and \
                self.vad_opts.detect_mode == VadDetectMode.kVadMutipleUtteranceDetectMode.value:
            self.ResetDetection()

@@ -0,0 +1,373 @@
# -*- encoding: utf-8 -*-
from pathlib import Path
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
import copy

import numpy as np
from typeguard import check_argument_types
import kaldi_native_fbank as knf

root_dir = Path(__file__).resolve().parent

logger_initialized = {}

class WavFrontend():
    """Conventional frontend structure for ASR."""

    def __init__(
            self,
            cmvn_file: str = None,
            fs: int = 16000,
            window: str = 'hamming',
            n_mels: int = 80,
            frame_length: int = 25,
            frame_shift: int = 10,
            lfr_m: int = 1,
            lfr_n: int = 1,
            dither: float = 1.0,
            **kwargs,
    ) -> None:
        check_argument_types()

        opts = knf.FbankOptions()
        opts.frame_opts.samp_freq = fs
        opts.frame_opts.dither = dither
        opts.frame_opts.window_type = window
        opts.frame_opts.frame_shift_ms = float(frame_shift)
        opts.frame_opts.frame_length_ms = float(frame_length)
        opts.mel_opts.num_bins = n_mels
        opts.energy_floor = 0
        opts.frame_opts.snip_edges = True
        opts.mel_opts.debug_mel = False
        self.opts = opts

        self.lfr_m = lfr_m
        self.lfr_n = lfr_n
        self.cmvn_file = cmvn_file

        if self.cmvn_file:
            self.cmvn = self.load_cmvn()
        self.fbank_fn = None
        self.fbank_beg_idx = 0
        self.reset_status()

    def fbank(self,
              waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        waveform = waveform * (1 << 15)  # rescale [-1, 1) floats to the int16 range
        self.fbank_fn = knf.OnlineFbank(self.opts)
        self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
        frames = self.fbank_fn.num_frames_ready
        mat = np.empty([frames, self.opts.mel_opts.num_bins])
        for i in range(frames):
            mat[i, :] = self.fbank_fn.get_frame(i)
        feat = mat.astype(np.float32)
        feat_len = np.array(mat.shape[0]).astype(np.int32)
        return feat, feat_len

    def fbank_online(self,
                     waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        waveform = waveform * (1 << 15)
        # reuse the persistent fbank extractor created in reset_status()
        self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
        frames = self.fbank_fn.num_frames_ready
        mat = np.empty([frames, self.opts.mel_opts.num_bins])
        for i in range(self.fbank_beg_idx, frames):
            mat[i, :] = self.fbank_fn.get_frame(i)
        # self.fbank_beg_idx += (frames - self.fbank_beg_idx)
        feat = mat.astype(np.float32)
        feat_len = np.array(mat.shape[0]).astype(np.int32)
        return feat, feat_len

    def reset_status(self):
        self.fbank_fn = knf.OnlineFbank(self.opts)
        self.fbank_beg_idx = 0

    def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        if self.lfr_m != 1 or self.lfr_n != 1:
            feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)

        if self.cmvn_file:
            feat = self.apply_cmvn(feat)

        feat_len = np.array(feat.shape[0]).astype(np.int32)
        return feat, feat_len

    @staticmethod
    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
        LFR_inputs = []

        T = inputs.shape[0]
        T_lfr = int(np.ceil(T / lfr_n))
        left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
        inputs = np.vstack((left_padding, inputs))
        T = T + (lfr_m - 1) // 2
        for i in range(T_lfr):
            if lfr_m <= T - i * lfr_n:
                LFR_inputs.append(
                    (inputs[i * lfr_n:i * lfr_n + lfr_m]).reshape(1, -1))
            else:
                # process the last LFR frame: pad with copies of the final frame
                num_padding = lfr_m - (T - i * lfr_n)
                frame = inputs[i * lfr_n:].reshape(-1)
                for _ in range(num_padding):
                    frame = np.hstack((frame, inputs[-1]))

                LFR_inputs.append(frame)
        LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
        return LFR_outputs

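    # Worked example (an added sketch, assuming the Paraformer-style defaults
    # lfr_m=7, lfr_n=6): 98 input frames of dim 80 are padded on the left with
    # (7 - 1) // 2 = 3 copies of the first frame, then every 6th frame is
    # stacked with its 6 successors, giving ceil(98 / 6) = 17 output frames of
    # dim 7 * 80 = 560.
    #
    #   feats = np.random.randn(98, 80).astype(np.float32)
    #   lfr = WavFrontend.apply_lfr(feats, 7, 6)
    #   assert lfr.shape == (17, 560)
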
    def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
        """
        Apply CMVN with mvn data.

        The mvn file stores a negated mean (<AddShift>) and an inverse
        standard deviation (<Rescale>), hence the add-then-multiply below.
        """
        frame, dim = inputs.shape
        means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
        vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))
        inputs = (inputs + means) * vars
        return inputs

    def load_cmvn(self,) -> np.ndarray:
        with open(self.cmvn_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        means_list = []
        vars_list = []
        for i in range(len(lines)):
            line_item = lines[i].split()
            if line_item[0] == '<AddShift>':
                line_item = lines[i + 1].split()
                if line_item[0] == '<LearnRateCoef>':
                    add_shift_line = line_item[3:(len(line_item) - 1)]
                    means_list = list(add_shift_line)
                    continue
            elif line_item[0] == '<Rescale>':
                line_item = lines[i + 1].split()
                if line_item[0] == '<LearnRateCoef>':
                    rescale_line = line_item[3:(len(line_item) - 1)]
                    vars_list = list(rescale_line)
                    continue

        means = np.array(means_list).astype(np.float64)
        vars = np.array(vars_list).astype(np.float64)
        cmvn = np.array([means, vars])
        return cmvn

class WavFrontendOnline(WavFrontend):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # streaming state
        self.frame_sample_length = int(self.opts.frame_opts.frame_length_ms * self.opts.frame_opts.samp_freq / 1000)
        self.frame_shift_sample_length = int(self.opts.frame_opts.frame_shift_ms * self.opts.frame_opts.samp_freq / 1000)
        self.waveform = None
        self.reserve_waveforms = None
        self.input_cache = None
        self.lfr_splice_cache = []

    @staticmethod
    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int, is_final: bool = False) -> Tuple[
            np.ndarray, np.ndarray, int]:
        """
        Apply LFR with data; `inputs` has the splice cache concatenated in front.
        """

        LFR_inputs = []
        T = inputs.shape[0]  # include the right context
        T_lfr = int(np.ceil((T - (lfr_m - 1) // 2) / lfr_n))  # minus the right context: (lfr_m - 1) // 2
        splice_idx = T_lfr
        for i in range(T_lfr):
            if lfr_m <= T - i * lfr_n:
                LFR_inputs.append((inputs[i * lfr_n:i * lfr_n + lfr_m]).reshape(1, -1))
            else:  # process the last LFR frame
                if is_final:
                    num_padding = lfr_m - (T - i * lfr_n)
                    frame = (inputs[i * lfr_n:]).reshape(-1)
                    for _ in range(num_padding):
                        frame = np.hstack((frame, inputs[-1]))
                    LFR_inputs.append(frame)
                else:
                    # update splice_idx and break the loop
                    splice_idx = i
                    break
        splice_idx = min(T - 1, splice_idx * lfr_n)
        lfr_splice_cache = inputs[splice_idx:, :]
        LFR_outputs = np.vstack(LFR_inputs)
        return LFR_outputs.astype(np.float32), lfr_splice_cache, splice_idx

    @staticmethod
    def compute_frame_num(sample_length: int, frame_sample_length: int, frame_shift_sample_length: int) -> int:
        frame_num = int((sample_length - frame_sample_length) / frame_shift_sample_length + 1)
        return frame_num if frame_num >= 1 and sample_length >= frame_sample_length else 0

    def fbank(
            self,
            input: np.ndarray,
            input_lengths: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        self.fbank_fn = knf.OnlineFbank(self.opts)
        batch_size = input.shape[0]
        if self.input_cache is None:
            self.input_cache = np.empty((batch_size, 0), dtype=np.float32)
        input = np.concatenate((self.input_cache, input), axis=1)
        frame_num = self.compute_frame_num(input.shape[-1], self.frame_sample_length, self.frame_shift_sample_length)
        # update self.input_cache with the samples not yet covered by a full frame shift
        self.input_cache = input[:, -(input.shape[-1] - frame_num * self.frame_shift_sample_length):]
        waveforms = np.empty(0, dtype=np.float32)
        feats_pad = np.empty(0, dtype=np.float32)
        feats_lens = np.empty(0, dtype=np.int32)
        if frame_num:
            waveforms = []
            feats = []
            feats_lens = []
            for i in range(batch_size):
                waveform = input[i]
                waveforms.append(
                    waveform[:((frame_num - 1) * self.frame_shift_sample_length + self.frame_sample_length)])
                waveform = waveform * (1 << 15)

                self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
                frames = self.fbank_fn.num_frames_ready
                mat = np.empty([frames, self.opts.mel_opts.num_bins])
                for j in range(frames):  # j, not i: do not shadow the batch index
                    mat[j, :] = self.fbank_fn.get_frame(j)
                feat = mat.astype(np.float32)
                feat_len = np.array(mat.shape[0]).astype(np.int32)
                feats.append(feat)
                feats_lens.append(feat_len)

            waveforms = np.stack(waveforms)
            feats_lens = np.array(feats_lens)
            feats_pad = np.array(feats)
        self.fbanks = feats_pad
        self.fbanks_lens = copy.deepcopy(feats_lens)
        return waveforms, feats_pad, feats_lens

    def get_fbank(self) -> Tuple[np.ndarray, np.ndarray]:
        return self.fbanks, self.fbanks_lens

    def lfr_cmvn(
            self,
            input: np.ndarray,
            input_lengths: np.ndarray,
            is_final: bool = False
    ) -> Tuple[np.ndarray, np.ndarray, List[int]]:
        batch_size = input.shape[0]
        feats = []
        feats_lens = []
        lfr_splice_frame_idxs = []
        for i in range(batch_size):
            mat = input[i, :input_lengths[i], :]
            lfr_splice_frame_idx = -1
            if self.lfr_m != 1 or self.lfr_n != 1:
                # self.lfr_splice_cache is updated via self.apply_lfr
                mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n,
                                                                                     is_final)
            if self.cmvn_file is not None:
                mat = self.apply_cmvn(mat)
            feat_length = mat.shape[0]
            feats.append(mat)
            feats_lens.append(feat_length)
            lfr_splice_frame_idxs.append(lfr_splice_frame_idx)

        feats_lens = np.array(feats_lens)
        feats_pad = np.array(feats)
        return feats_pad, feats_lens, lfr_splice_frame_idxs

    def extract_fbank(
            self, input: np.ndarray, input_lengths: np.ndarray, is_final: bool = False
    ) -> Tuple[np.ndarray, np.ndarray]:
        batch_size = input.shape[0]
        assert batch_size == 1, 'online feature extraction only supports batch_size == 1 for now'
        waveforms, feats, feats_lengths = self.fbank(input, input_lengths)  # input shape: B T D
        if feats.shape[0]:
            self.waveforms = waveforms if self.reserve_waveforms is None else np.concatenate(
                (self.reserve_waveforms, waveforms), axis=1)
            if not self.lfr_splice_cache:
                for i in range(batch_size):
                    self.lfr_splice_cache.append(
                        np.expand_dims(feats[i][0, :], axis=0).repeat((self.lfr_m - 1) // 2, axis=0))

            if feats_lengths[0] + self.lfr_splice_cache[0].shape[0] >= self.lfr_m:
                lfr_splice_cache_np = np.stack(self.lfr_splice_cache)  # B T D
                feats = np.concatenate((lfr_splice_cache_np, feats), axis=1)
                feats_lengths += lfr_splice_cache_np[0].shape[0]
                frame_from_waveforms = int(
                    (self.waveforms.shape[1] - self.frame_sample_length) / self.frame_shift_sample_length + 1)
                minus_frame = (self.lfr_m - 1) // 2 if self.reserve_waveforms is None else 0
                feats, feats_lengths, lfr_splice_frame_idxs = self.lfr_cmvn(feats, feats_lengths, is_final)
                if self.lfr_m == 1:
                    self.reserve_waveforms = None
                else:
                    reserve_frame_idx = lfr_splice_frame_idxs[0] - minus_frame
                    self.reserve_waveforms = self.waveforms[
                        :, reserve_frame_idx * self.frame_shift_sample_length:
                           frame_from_waveforms * self.frame_shift_sample_length]
                    sample_length = (frame_from_waveforms - 1) * self.frame_shift_sample_length \
                        + self.frame_sample_length
                    self.waveforms = self.waveforms[:, :sample_length]
            else:
                # update self.reserve_waveforms and self.lfr_splice_cache
                self.reserve_waveforms = self.waveforms[:,
                                         :-(self.frame_sample_length - self.frame_shift_sample_length)]
                for i in range(batch_size):
                    self.lfr_splice_cache[i] = np.concatenate((self.lfr_splice_cache[i], feats[i]), axis=0)
                return np.empty(0, dtype=np.float32), feats_lengths
        else:
            if is_final:
                self.waveforms = waveforms if self.reserve_waveforms is None else self.reserve_waveforms
                feats = np.stack(self.lfr_splice_cache)
                feats_lengths = np.zeros(batch_size, dtype=np.int32) + feats.shape[1]
                feats, feats_lengths, _ = self.lfr_cmvn(feats, feats_lengths, is_final)
        if is_final:
            self.cache_reset()
        return feats, feats_lengths

    def get_waveforms(self):
        return self.waveforms

    def cache_reset(self):
        self.fbank_fn = knf.OnlineFbank(self.opts)
        self.reserve_waveforms = None
        self.input_cache = None
        self.lfr_splice_cache = []

def load_bytes(input):
    middle_data = np.frombuffer(input, dtype=np.int16)
    middle_data = np.asarray(middle_data)
    if middle_data.dtype.kind not in 'iu':
        raise TypeError("'middle_data' must be an array of integers")
    dtype = np.dtype('float32')
    if dtype.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")

    i = np.iinfo(middle_data.dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max
    array = (middle_data.astype(dtype) - offset) / abs_max
    return array

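# Illustrative usage (an added sketch; the file name is an assumption): raw
# headerless 16-bit PCM bytes, e.g. read from a socket or a .pcm file, are
# rescaled to float32 samples in [-1.0, 1.0).
#
#   with open('audio.pcm', 'rb') as f:
#       samples = load_bytes(f.read())
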
def test():
    path = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav"
    import librosa
    cmvn_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/am.mvn"
    config_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/config.yaml"
    from funasr_local.runtime.python.onnxruntime.rapid_paraformer.utils.utils import read_yaml
    config = read_yaml(config_file)
    waveform, _ = librosa.load(path, sr=None)
    frontend = WavFrontend(
        cmvn_file=cmvn_file,
        **config['frontend_conf'],
    )
    speech, _ = frontend.fbank_online(waveform)  # input: 1-d (samples,) numpy array
    feat, feat_len = frontend.lfr_cmvn(speech)  # 2-d (frames, 450) np.float32; torch.from_numpy() gives (1, frames, 450)

    frontend.reset_status()  # clear cache
    return feat, feat_len


if __name__ == '__main__':
    test()

@@ -0,0 +1,242 @@
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

import string
import logging
from typing import Any, List, Union


def isChinese(ch: str):
    # note: ASCII digits are deliberately treated like Chinese characters here
    if '\u4e00' <= ch <= '\u9fff' or '\u0030' <= ch <= '\u0039':
        return True
    return False

def isAllChinese(word: Union[List[Any], str]):
    word_lists = []
    for i in word:
        cur = i.replace(' ', '')
        cur = cur.replace('</s>', '')
        cur = cur.replace('<s>', '')
        word_lists.append(cur)

    if len(word_lists) == 0:
        return False

    for ch in word_lists:
        if isChinese(ch) is False:
            return False
    return True


def isAllAlpha(word: Union[List[Any], str]):
    word_lists = []
    for i in word:
        cur = i.replace(' ', '')
        cur = cur.replace('</s>', '')
        cur = cur.replace('<s>', '')
        word_lists.append(cur)

    if len(word_lists) == 0:
        return False

    for ch in word_lists:
        if ch.isalpha() is False and ch != "'":
            return False
        elif ch.isalpha() is True and isChinese(ch) is True:
            return False

    return True

def abbr_dispose(words: List[Any], time_stamp: List[List] = None) -> List[Any]:
    words_size = len(words)
    word_lists = []
    abbr_begin = []
    abbr_end = []
    last_num = -1
    ts_lists = []
    ts_nums = []
    ts_index = 0
    # first pass: locate abbreviations, i.e. runs of single letters separated by spaces
    for num in range(words_size):
        if num <= last_num:
            continue

        if len(words[num]) == 1 and words[num].encode('utf-8').isalpha():
            if num + 1 < words_size and words[num + 1] == ' ' and num + 2 < words_size \
                    and len(words[num + 2]) == 1 and words[num + 2].encode('utf-8').isalpha():
                # found the begin of an abbreviation
                abbr_begin.append(num)
                num += 2
                abbr_end.append(num)
                # extend to find the end of the abbreviation
                while True:
                    num += 1
                    if num < words_size and words[num] == ' ':
                        num += 1
                        if num < words_size and len(words[num]) == 1 and words[num].encode('utf-8').isalpha():
                            abbr_end.pop()
                            abbr_end.append(num)
                            last_num = num
                        else:
                            break
                    else:
                        break

    # map each word position to a timestamp index; spaces reuse the index of the following token
    for num in range(words_size):
        if words[num] == ' ':
            ts_nums.append(ts_index)
        else:
            ts_nums.append(ts_index)
            ts_index += 1
    # second pass: upper-case the letters of each abbreviation and merge their timestamps
    last_num = -1
    for num in range(words_size):
        if num <= last_num:
            continue

        if num in abbr_begin:
            if time_stamp is not None:
                begin = time_stamp[ts_nums[num]][0]
            word_lists.append(words[num].upper())
            num += 1
            while num < words_size:
                if num in abbr_end:
                    word_lists.append(words[num].upper())
                    last_num = num
                    break
                else:
                    if words[num].encode('utf-8').isalpha():
                        word_lists.append(words[num].upper())
                num += 1
            if time_stamp is not None:
                end = time_stamp[ts_nums[num]][1]
                ts_lists.append([begin, end])
        else:
            word_lists.append(words[num])
            if time_stamp is not None and words[num] != ' ':
                begin = time_stamp[ts_nums[num]][0]
                end = time_stamp[ts_nums[num]][1]
                ts_lists.append([begin, end])
                begin = end

    if time_stamp is not None:
        return word_lists, ts_lists
    else:
        return word_lists

def sentence_postprocess(words: List[Any], time_stamp: List[List] = None):
    middle_lists = []
    word_lists = []
    word_item = ''
    ts_lists = []

    # wash the word list: decode bytes and drop sentinel tokens
    for i in words:
        word = ''
        if isinstance(i, str):
            word = i
        else:
            word = i.decode('utf-8')

        if word in ['<s>', '</s>', '<unk>']:
            continue
        else:
            middle_lists.append(word)

    # all Chinese characters
    if isAllChinese(middle_lists):
        for i, ch in enumerate(middle_lists):
            word_lists.append(ch.replace(' ', ''))
        if time_stamp is not None:
            ts_lists = time_stamp

    # all alphabetic characters
    elif isAllAlpha(middle_lists):
        ts_flag = True
        for i, ch in enumerate(middle_lists):
            if ts_flag and time_stamp is not None:
                begin = time_stamp[i][0]
                end = time_stamp[i][1]
            word = ''
            if '@@' in ch:
                # '@@' marks a non-final BPE piece: accumulate without closing the word
                word = ch.replace('@@', '')
                word_item += word
                if time_stamp is not None:
                    ts_flag = False
                    end = time_stamp[i][1]
            else:
                word_item += ch
                word_lists.append(word_item)
                word_lists.append(' ')
                word_item = ''
                if time_stamp is not None:
                    ts_flag = True
                    end = time_stamp[i][1]
                    ts_lists.append([begin, end])
                    begin = end

    # mixed characters
    else:
        alpha_blank = False
        ts_flag = True
        begin = -1
        end = -1
        for i, ch in enumerate(middle_lists):
            if ts_flag and time_stamp is not None:
                begin = time_stamp[i][0]
                end = time_stamp[i][1]
            word = ''
            if isAllChinese(ch):
                if alpha_blank is True:
                    word_lists.pop()
                word_lists.append(ch)
                alpha_blank = False
                if time_stamp is not None:
                    ts_flag = True
                    ts_lists.append([begin, end])
                    begin = end
            elif '@@' in ch:
                word = ch.replace('@@', '')
                word_item += word
                alpha_blank = False
                if time_stamp is not None:
                    ts_flag = False
                    end = time_stamp[i][1]
            elif isAllAlpha(ch):
                word_item += ch
                word_lists.append(word_item)
                word_lists.append(' ')
                word_item = ''
                alpha_blank = True
                if time_stamp is not None:
                    ts_flag = True
                    end = time_stamp[i][1]
                    ts_lists.append([begin, end])
                    begin = end
            else:
                raise ValueError('invalid character: {}'.format(ch))

    if time_stamp is not None:
        word_lists, ts_lists = abbr_dispose(word_lists, ts_lists)
        real_word_lists = []
        for ch in word_lists:
            if ch != ' ':
                real_word_lists.append(ch)
        sentence = ' '.join(real_word_lists).strip()
        return sentence, ts_lists, real_word_lists
    else:
        word_lists = abbr_dispose(word_lists)
        real_word_lists = []
        for ch in word_lists:
            if ch != ' ':
                real_word_lists.append(ch)
        sentence = ''.join(word_lists).strip()
        return sentence, real_word_lists

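# Illustrative usage (an added sketch; the token values are assumptions,
# following the '@@' non-final-subword convention used above):
#
#   sentence, words = sentence_postprocess(['今', '天', 'we@@', 'ather', '好'])
#   # sentence == '今天weather好', words == ['今', '天', 'weather', '好']
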
@@ -0,0 +1,63 @@
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

import numpy as np


def time_stamp_lfr6_onnx(us_cif_peak, char_list, begin_time=0.0, total_offset=-1.5):
    if not len(char_list):
        return []
    START_END_THRESHOLD = 5
    MAX_TOKEN_DURATION = 30
    TIME_RATE = 10.0 * 6 / 1000 / 3  # 10 ms shift, 6-frame LFR, 3x upsampled
    cif_peak = us_cif_peak.reshape(-1)
    num_frames = cif_peak.shape[-1]
    if char_list[-1] == '</s>':
        char_list = char_list[:-1]
    timestamp_list = []
    new_char_list = []
    # for a bicif model trained with large data, cif2 actually fires when a character starts,
    # so treat the frames between two peaks as the duration of the former token
    fire_place = np.where(cif_peak > 1.0 - 1e-4)[0] + total_offset
    num_peak = len(fire_place)
    assert num_peak == len(char_list) + 1  # number of peaks is supposed to be number of tokens + 1
    # begin silence
    if fire_place[0] > START_END_THRESHOLD:
        timestamp_list.append([0.0, fire_place[0] * TIME_RATE])
        new_char_list.append('<sil>')
    # token timestamps
    for i in range(len(fire_place) - 1):
        new_char_list.append(char_list[i])
        if i == len(fire_place) - 2 or MAX_TOKEN_DURATION < 0 or fire_place[i + 1] - fire_place[i] < MAX_TOKEN_DURATION:
            timestamp_list.append([fire_place[i] * TIME_RATE, fire_place[i + 1] * TIME_RATE])
        else:
            # an overlong gap of zero-weight frames is split into the token plus a trailing silence
            _split = fire_place[i] + MAX_TOKEN_DURATION
            timestamp_list.append([fire_place[i] * TIME_RATE, _split * TIME_RATE])
            timestamp_list.append([_split * TIME_RATE, fire_place[i + 1] * TIME_RATE])
            new_char_list.append('<sil>')
    # tail token and end silence
    if num_frames - fire_place[-1] > START_END_THRESHOLD:
        _end = (num_frames + fire_place[-1]) / 2
        timestamp_list[-1][1] = _end * TIME_RATE
        timestamp_list.append([_end * TIME_RATE, num_frames * TIME_RATE])
        new_char_list.append("<sil>")
    else:
        timestamp_list[-1][1] = num_frames * TIME_RATE
    if begin_time:  # add the vad segment offset for models run after vad
        for i in range(len(timestamp_list)):
            timestamp_list[i][0] = timestamp_list[i][0] + begin_time / 1000.0
            timestamp_list[i][1] = timestamp_list[i][1] + begin_time / 1000.0
    assert len(new_char_list) == len(timestamp_list)
    res_str = ""
    for char, timestamp in zip(new_char_list, timestamp_list):
        res_str += "{} {} {};".format(char, timestamp[0], timestamp[1])
    res = []
    for char, timestamp in zip(new_char_list, timestamp_list):
        if char != '<sil>':
            res.append([int(timestamp[0] * 1000), int(timestamp[1] * 1000)])
    return res_str, res

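# Illustrative sketch (an added example with a toy peak track; real peaks come
# from the model's upsampled CIF output): three fires for two tokens, since the
# number of peaks is the number of tokens plus one.
#
#   peaks = np.zeros(30, dtype=np.float32)
#   peaks[[2, 12, 22]] = 1.0
#   res_str, res = time_stamp_lfr6_onnx(peaks, ['你', '好'])
#   # res == [[10, 210], [210, 505]]  -> [start_ms, end_ms] per token
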
@@ -0,0 +1,274 @@
# -*- encoding: utf-8 -*-

import functools
import logging
import pickle
from pathlib import Path
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union

import numpy as np
import yaml
from onnxruntime import (GraphOptimizationLevel, InferenceSession,
                         SessionOptions, get_available_providers, get_device)
from typeguard import check_argument_types

import warnings

root_dir = Path(__file__).resolve().parent

logger_initialized = {}

class TokenIDConverter():
    def __init__(self, token_list: Union[List, str],
                 ):
        check_argument_types()

        self.token_list = token_list
        self.unk_symbol = token_list[-1]  # the last entry is used as the unknown symbol
        self.token2id = {v: i for i, v in enumerate(self.token_list)}
        self.unk_id = self.token2id[self.unk_symbol]

    def get_num_vocabulary_size(self) -> int:
        return len(self.token_list)

    def ids2tokens(self,
                   integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
        if isinstance(integers, np.ndarray) and integers.ndim != 1:
            raise TokenIDConverterError(
                f"Must be 1 dim ndarray, but got {integers.ndim}")
        return [self.token_list[i] for i in integers]

    def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
        return [self.token2id.get(i, self.unk_id) for i in tokens]

class CharTokenizer():
    def __init__(
            self,
            symbol_value: Union[Path, str, Iterable[str]] = None,
            space_symbol: str = "<space>",
            remove_non_linguistic_symbols: bool = False,
    ):
        check_argument_types()

        self.space_symbol = space_symbol
        self.non_linguistic_symbols = self.load_symbols(symbol_value)
        self.remove_non_linguistic_symbols = remove_non_linguistic_symbols

    @staticmethod
    def load_symbols(value: Union[Path, str, Iterable[str]] = None) -> Set:
        if value is None:
            return set()

        # isinstance() cannot take a parameterized generic such as Iterable[str],
        # so anything that is not path-like is treated as an iterable of symbols
        if not isinstance(value, (Path, str)):
            return set(value)

        file_path = Path(value)
        if not file_path.exists():
            logging.warning("%s doesn't exist.", file_path)
            return set()

        with file_path.open("r", encoding="utf-8") as f:
            return set(line.rstrip() for line in f)

    def text2tokens(self, line: Union[str, list]) -> List[str]:
        tokens = []
        while len(line) != 0:
            for w in self.non_linguistic_symbols:
                if line.startswith(w):
                    if not self.remove_non_linguistic_symbols:
                        tokens.append(line[: len(w)])
                    line = line[len(w):]
                    break
            else:
                t = line[0]
                if t == " ":
                    t = "<space>"
                tokens.append(t)
                line = line[1:]
        return tokens

    def tokens2text(self, tokens: Iterable[str]) -> str:
        tokens = [t if t != self.space_symbol else " " for t in tokens]
        return "".join(tokens)

    def __repr__(self):
        return (
            f"{self.__class__.__name__}("
            f'space_symbol="{self.space_symbol}", '
            f'non_linguistic_symbols="{self.non_linguistic_symbols}"'
            f")"
        )

class Hypothesis(NamedTuple):
    """Hypothesis data type."""

    yseq: np.ndarray
    score: Union[float, np.ndarray] = 0
    scores: Dict[str, Union[float, np.ndarray]] = dict()
    states: Dict[str, Any] = dict()

    def asdict(self) -> dict:
        """Convert data to JSON-friendly dict."""
        return self._replace(
            yseq=self.yseq.tolist(),
            score=float(self.score),
            scores={k: float(v) for k, v in self.scores.items()},
        )._asdict()


class TokenIDConverterError(Exception):
    pass


class ONNXRuntimeError(Exception):
    pass

class OrtInferSession():
    def __init__(self, model_file, device_id=-1, intra_op_num_threads=4):
        device_id = str(device_id)
        sess_opt = SessionOptions()
        sess_opt.intra_op_num_threads = intra_op_num_threads
        sess_opt.log_severity_level = 4
        sess_opt.enable_cpu_mem_arena = False
        sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

        cuda_ep = 'CUDAExecutionProvider'
        cuda_provider_options = {
            "device_id": device_id,
            "arena_extend_strategy": "kNextPowerOfTwo",
            "cudnn_conv_algo_search": "EXHAUSTIVE",
            "do_copy_in_default_stream": "true",
        }
        cpu_ep = 'CPUExecutionProvider'
        cpu_provider_options = {
            "arena_extend_strategy": "kSameAsRequested",
        }

        EP_list = []
        if device_id != "-1" and get_device() == 'GPU' \
                and cuda_ep in get_available_providers():
            EP_list = [(cuda_ep, cuda_provider_options)]
        EP_list.append((cpu_ep, cpu_provider_options))

        self._verify_model(model_file)
        self.session = InferenceSession(model_file,
                                        sess_options=sess_opt,
                                        providers=EP_list)

        if device_id != "-1" and cuda_ep not in self.session.get_providers():
            warnings.warn(f'{cuda_ep} is not available in the current env, so inference automatically falls back to {cpu_ep}.\n'
                          'Please ensure the installed onnxruntime-gpu version matches your cuda and cudnn versions; '
                          'you can check their relations on the official web site: '
                          'https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html',
                          RuntimeWarning)

    def __call__(self,
                 input_content: List[np.ndarray]) -> np.ndarray:
        input_dict = dict(zip(self.get_input_names(), input_content))
        try:
            return self.session.run(self.get_output_names(), input_dict)
        except Exception as e:
            raise ONNXRuntimeError('ONNXRuntime inference failed.') from e

    def get_input_names(self, ):
        return [v.name for v in self.session.get_inputs()]

    def get_output_names(self,):
        return [v.name for v in self.session.get_outputs()]

    def get_character_list(self, key: str = 'character'):
        # have_key() must be called first so that self.meta_dict is populated
        return self.meta_dict[key].splitlines()

    def have_key(self, key: str = 'character') -> bool:
        self.meta_dict = self.session.get_modelmeta().custom_metadata_map
        if key in self.meta_dict.keys():
            return True
        return False

    @staticmethod
    def _verify_model(model_path):
        model_path = Path(model_path)
        if not model_path.exists():
            raise FileNotFoundError(f'{model_path} does not exist.')
        if not model_path.is_file():
            raise FileExistsError(f'{model_path} is not a file.')

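# Illustrative usage (an added sketch; the model file name and input list are
# assumptions, and must match the input order of the exported ONNX graph):
#
#   sess = OrtInferSession('model.onnx', device_id=-1, intra_op_num_threads=4)
#   outputs = sess([feats, feats_len])   # list of np.ndarray, one per graph output
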
def split_to_mini_sentence(words: list, word_limit: int = 20):
    assert word_limit > 1
    if len(words) <= word_limit:
        return [words]
    sentences = []
    length = len(words)
    sentence_len = length // word_limit
    for i in range(sentence_len):
        sentences.append(words[i * word_limit:(i + 1) * word_limit])
    if length % word_limit > 0:
        sentences.append(words[sentence_len * word_limit:])
    return sentences

def code_mix_split_words(text: str):
    words = []
    segs = text.split()
    for seg in segs:
        # There is no space inside seg.
        current_word = ""
        for c in seg:
            if len(c.encode()) == 1:
                # This is an ASCII char.
                current_word += c
            else:
                # This is a Chinese char.
                if len(current_word) > 0:
                    words.append(current_word)
                    current_word = ""
                words.append(c)
        if len(current_word) > 0:
            words.append(current_word)
    return words

def read_yaml(yaml_path: Union[str, Path]) -> Dict:
|
||||
if not Path(yaml_path).exists():
|
||||
raise FileExistsError(f'The {yaml_path} does not exist.')
|
||||
|
||||
with open(str(yaml_path), 'rb') as f:
|
||||
data = yaml.load(f, Loader=yaml.Loader)
|
||||
return data
|
||||
|
||||
|
||||
@functools.lru_cache()
|
||||
def get_logger(name='funasr_local_onnx'):
|
||||
"""Initialize and get a logger by name.
|
||||
If the logger has not been initialized, this method will initialize the
|
||||
logger by adding one or two handlers, otherwise the initialized logger will
|
||||
be directly returned. During initialization, a StreamHandler will always be
|
||||
added.
|
||||
Args:
|
||||
name (str): Logger name.
|
||||
Returns:
|
||||
logging.Logger: The expected logger.
|
||||
"""
|
||||
logger = logging.getLogger(name)
|
||||
if name in logger_initialized:
|
||||
return logger
|
||||
|
||||
for logger_name in logger_initialized:
|
||||
if name.startswith(logger_name):
|
||||
return logger
|
||||
|
||||
formatter = logging.Formatter(
|
||||
'[%(asctime)s] %(name)s %(levelname)s: %(message)s',
|
||||
datefmt="%Y/%m/%d %H:%M:%S")
|
||||
|
||||
sh = logging.StreamHandler()
|
||||
sh.setFormatter(formatter)
|
||||
logger.addHandler(sh)
|
||||
logger_initialized[name] = True
|
||||
logger.propagate = False
|
||||
return logger
|
||||
282
funasr_local/runtime/python/onnxruntime/funasr_onnx/vad_bin.py
Normal file
@@ -0,0 +1,282 @@
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

import os.path
from pathlib import Path
from typing import List, Union, Tuple

import copy
import librosa
import numpy as np

from .utils.utils import (ONNXRuntimeError,
                          OrtInferSession, get_logger,
                          read_yaml)
from .utils.frontend import WavFrontend, WavFrontendOnline
from .utils.e2e_vad import E2EVadModel

logging = get_logger()


class Fsmn_vad():
    """
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Deep-FSMN for Large Vocabulary Continuous Speech Recognition
    https://arxiv.org/abs/1803.05030
    """
    def __init__(self, model_dir: Union[str, Path] = None,
                 batch_size: int = 1,
                 device_id: Union[str, int] = "-1",
                 quantize: bool = False,
                 intra_op_num_threads: int = 4,
                 max_end_sil: int = None,
                 ):

        if not Path(model_dir).exists():
            raise FileNotFoundError(f'{model_dir} does not exist.')

        model_file = os.path.join(model_dir, 'model.onnx')
        if quantize:
            model_file = os.path.join(model_dir, 'model_quant.onnx')
        config_file = os.path.join(model_dir, 'vad.yaml')
        cmvn_file = os.path.join(model_dir, 'vad.mvn')
        config = read_yaml(config_file)

        self.frontend = WavFrontend(
            cmvn_file=cmvn_file,
            **config['frontend_conf']
        )
        self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
        self.batch_size = batch_size
        self.vad_scorer = E2EVadModel(config["vad_post_conf"])
        self.max_end_sil = max_end_sil if max_end_sil is not None else config["vad_post_conf"]["max_end_silence_time"]
        self.encoder_conf = config["encoder_conf"]

    def prepare_cache(self, in_cache: list = []):
        if len(in_cache) > 0:
            return in_cache
        fsmn_layers = self.encoder_conf["fsmn_layers"]
        proj_dim = self.encoder_conf["proj_dim"]
        lorder = self.encoder_conf["lorder"]
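        # One zero-initialized cache per FSMN layer; each memory block looks
        # back (lorder - 1) frames, hence the (1, proj_dim, lorder - 1, 1) shape.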
        for i in range(fsmn_layers):
            cache = np.zeros((1, proj_dim, lorder - 1, 1)).astype(np.float32)
            in_cache.append(cache)
        return in_cache

    def __call__(self, audio_in: Union[str, np.ndarray, List[str]], **kwargs) -> List:
        waveform_list = self.load_data(audio_in, self.frontend.opts.frame_opts.samp_freq)
        waveform_nums = len(waveform_list)
        is_final = kwargs.get('is_final', False)

        # Use independent lists: [[]] * batch_size would alias one shared list.
        segments = [[] for _ in range(self.batch_size)]
        for beg_idx in range(0, waveform_nums, self.batch_size):

            end_idx = min(waveform_nums, beg_idx + self.batch_size)
            waveform = waveform_list[beg_idx:end_idx]
            feats, feats_len = self.extract_feat(waveform)
            waveform = np.array(waveform)
            param_dict = kwargs.get('param_dict', dict())
            in_cache = param_dict.get('in_cache', list())
            in_cache = self.prepare_cache(in_cache)
            try:
                t_offset = 0
                step = int(min(feats_len.max(), 6000))
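                # Stream the feature matrix in windows of at most 6000 frames;
                # assuming 16 kHz input with a 10 ms frame shift (160 samples)
                # and a 25 ms window (400 samples), the matching waveform slice
                # is recovered below.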
                for t_offset in range(0, int(feats_len), min(step, feats_len - t_offset)):
                    if t_offset + step >= feats_len - 1:
                        step = feats_len - t_offset
                        is_final = True
                    else:
                        is_final = False
                    feats_package = feats[:, t_offset:int(t_offset + step), :]
                    waveform_package = waveform[:, t_offset * 160:min(waveform.shape[-1], (int(t_offset + step) - 1) * 160 + 400)]

                    inputs = [feats_package]
                    inputs.extend(in_cache)
                    scores, out_caches = self.infer(inputs)
                    in_cache = out_caches
                    segments_part = self.vad_scorer(scores, waveform_package, is_final=is_final, max_end_sil=self.max_end_sil, online=False)

                    if segments_part:
                        for batch_num in range(0, self.batch_size):
                            segments[batch_num] += segments_part[batch_num]

            except ONNXRuntimeError:
                logging.warning("input wav is silence or noise")
                segments = []

        return segments

    def load_data(self,
                  wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
        def load_wav(path: str) -> np.ndarray:
            waveform, _ = librosa.load(path, sr=fs)
            return waveform

        if isinstance(wav_content, np.ndarray):
            return [wav_content]

        if isinstance(wav_content, str):
            return [load_wav(wav_content)]

        if isinstance(wav_content, list):
            return [load_wav(path) for path in wav_content]

        raise TypeError(
            f'The type of {wav_content} is not in [str, np.ndarray, list]')

    def extract_feat(self,
                     waveform_list: List[np.ndarray]
                     ) -> Tuple[np.ndarray, np.ndarray]:
        feats, feats_len = [], []
        for waveform in waveform_list:
            speech, _ = self.frontend.fbank(waveform)
            feat, feat_len = self.frontend.lfr_cmvn(speech)
            feats.append(feat)
            feats_len.append(feat_len)

        feats = self.pad_feats(feats, np.max(feats_len))
        feats_len = np.array(feats_len).astype(np.int32)
        return feats, feats_len

    @staticmethod
    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
            pad_width = ((0, max_feat_len - cur_len), (0, 0))
            return np.pad(feat, pad_width, 'constant', constant_values=0)

        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
        feats = np.array(feat_res).astype(np.float32)
        return feats

    def infer(self, feats: List) -> Tuple[np.ndarray, np.ndarray]:

        outputs = self.ort_infer(feats)
        scores, out_caches = outputs[0], outputs[1:]
        return scores, out_caches


class Fsmn_vad_online():
    """
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Deep-FSMN for Large Vocabulary Continuous Speech Recognition
    https://arxiv.org/abs/1803.05030
    """
    def __init__(self, model_dir: Union[str, Path] = None,
                 batch_size: int = 1,
                 device_id: Union[str, int] = "-1",
                 quantize: bool = False,
                 intra_op_num_threads: int = 4,
                 max_end_sil: int = None,
                 ):

        if not Path(model_dir).exists():
            raise FileNotFoundError(f'{model_dir} does not exist.')

        model_file = os.path.join(model_dir, 'model.onnx')
        if quantize:
            model_file = os.path.join(model_dir, 'model_quant.onnx')
        config_file = os.path.join(model_dir, 'vad.yaml')
        cmvn_file = os.path.join(model_dir, 'vad.mvn')
        config = read_yaml(config_file)

        self.frontend = WavFrontendOnline(
            cmvn_file=cmvn_file,
            **config['frontend_conf']
        )
        self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
        self.batch_size = batch_size
        self.vad_scorer = E2EVadModel(config["vad_post_conf"])
        self.max_end_sil = max_end_sil if max_end_sil is not None else config["vad_post_conf"]["max_end_silence_time"]
        self.encoder_conf = config["encoder_conf"]

    def prepare_cache(self, in_cache: list = []):
        if len(in_cache) > 0:
            return in_cache
        fsmn_layers = self.encoder_conf["fsmn_layers"]
        proj_dim = self.encoder_conf["proj_dim"]
        lorder = self.encoder_conf["lorder"]
        for i in range(fsmn_layers):
            cache = np.zeros((1, proj_dim, lorder - 1, 1)).astype(np.float32)
            in_cache.append(cache)
        return in_cache

    def __call__(self, audio_in: np.ndarray, **kwargs) -> List:
        waveforms = np.expand_dims(audio_in, axis=0)

        param_dict = kwargs.get('param_dict', dict())
        is_final = param_dict.get('is_final', False)
        feats, feats_len = self.extract_feat(waveforms, is_final)
        segments = []
        if feats.size != 0:
            in_cache = param_dict.get('in_cache', list())
            in_cache = self.prepare_cache(in_cache)
            try:
                inputs = [feats]
                inputs.extend(in_cache)
                scores, out_caches = self.infer(inputs)
                param_dict['in_cache'] = out_caches
                waveforms = self.frontend.get_waveforms()
                segments = self.vad_scorer(scores, waveforms, is_final=is_final, max_end_sil=self.max_end_sil,
                                           online=True)

            except ONNXRuntimeError:
                logging.warning("input wav is silence or noise")
                segments = []
        return segments

    def load_data(self,
                  wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
        def load_wav(path: str) -> np.ndarray:
            waveform, _ = librosa.load(path, sr=fs)
            return waveform

        if isinstance(wav_content, np.ndarray):
            return [wav_content]

        if isinstance(wav_content, str):
            return [load_wav(wav_content)]

        if isinstance(wav_content, list):
            return [load_wav(path) for path in wav_content]

        raise TypeError(
            f'The type of {wav_content} is not in [str, np.ndarray, list]')

    def extract_feat(self,
                     waveforms: np.ndarray, is_final: bool = False
                     ) -> Tuple[np.ndarray, np.ndarray]:
        waveforms_lens = np.zeros(waveforms.shape[0]).astype(np.int32)
        for idx, waveform in enumerate(waveforms):
            waveforms_lens[idx] = waveform.shape[-1]

        feats, feats_len = self.frontend.extract_fbank(waveforms, waveforms_lens, is_final)
        return feats.astype(np.float32), feats_len.astype(np.int32)

    @staticmethod
    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
            pad_width = ((0, max_feat_len - cur_len), (0, 0))
            return np.pad(feat, pad_width, 'constant', constant_values=0)

        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
        feats = np.array(feat_res).astype(np.float32)
        return feats

    def infer(self, feats: List) -> Tuple[np.ndarray, np.ndarray]:

        outputs = self.ort_infer(feats)
        scores, out_caches = outputs[0], outputs[1:]
        return scores, out_caches
45
funasr_local/runtime/python/onnxruntime/setup.py
Normal file
@@ -0,0 +1,45 @@
# -*- encoding: utf-8 -*-
from pathlib import Path
import setuptools


def get_readme():
    root_dir = Path(__file__).resolve().parent
    readme_path = str(root_dir / 'README.md')
    with open(readme_path, 'r', encoding='utf-8') as f:
        readme = f.read()
    return readme


MODULE_NAME = 'funasr_local_onnx'
VERSION_NUM = '0.0.6'

setuptools.setup(
    name=MODULE_NAME,
    version=VERSION_NUM,
    platforms="Any",
    url="https://github.com/alibaba-damo-academy/FunASR.git",
    author="Speech Lab of DAMO Academy, Alibaba Group",
    author_email="funasr_local@list.alibaba-inc.com",
    description="FunASR: A Fundamental End-to-End Speech Recognition Toolkit",
    license='MIT',
    long_description=get_readme(),
    long_description_content_type='text/markdown',
    include_package_data=True,
    install_requires=["librosa", "onnxruntime>=1.7.0",
                      "scipy", "numpy>=1.19.3",
                      "typeguard", "kaldi-native-fbank",
                      "PyYAML>=5.1.2"],
    packages=[MODULE_NAME, f'{MODULE_NAME}.utils'],
    keywords=[
        'funasr_local,asr'
    ],
    classifiers=[
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
    ],
)
157
funasr_local/runtime/python/utils/compute_wer.py
Normal file
@@ -0,0 +1,157 @@
import sys

import numpy as np


def compute_wer(ref_file,
                hyp_file,
                cer_detail_file):
    rst = {
        'Wrd': 0,
        'Corr': 0,
        'Ins': 0,
        'Del': 0,
        'Sub': 0,
        'Snt': 0,
        'Err': 0.0,
        'S.Err': 0.0,
        'wrong_words': 0,
        'wrong_sentences': 0
    }

    hyp_dict = {}
    ref_dict = {}
    with open(hyp_file, 'r') as hyp_reader:
        for line in hyp_reader:
            key = line.strip().split()[0]
            value = line.strip().split()[1:]
            hyp_dict[key] = value
    with open(ref_file, 'r') as ref_reader:
        for line in ref_reader:
            key = line.strip().split()[0]
            value = line.strip().split()[1:]
            ref_dict[key] = value

    cer_detail_writer = open(cer_detail_file, 'w')
    for hyp_key in hyp_dict:
        if hyp_key in ref_dict:
            out_item = compute_wer_by_line(hyp_dict[hyp_key], ref_dict[hyp_key])
            rst['Wrd'] += out_item['nwords']
            rst['Corr'] += out_item['cor']
            rst['wrong_words'] += out_item['wrong']
            rst['Ins'] += out_item['ins']
            rst['Del'] += out_item['del']
            rst['Sub'] += out_item['sub']
            rst['Snt'] += 1
            if out_item['wrong'] > 0:
                rst['wrong_sentences'] += 1
            cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
            cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
            cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')

    if rst['Wrd'] > 0:
        rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)
    if rst['Snt'] > 0:
        rst['S.Err'] = round(rst['wrong_sentences'] * 100 / rst['Snt'], 2)

    cer_detail_writer.write('\n')
    cer_detail_writer.write("%WER " + str(rst['Err']) + " [ " + str(rst['wrong_words']) + " / " + str(rst['Wrd']) +
                            ", " + str(rst['Ins']) + " ins, " + str(rst['Del']) + " del, " + str(rst['Sub']) + " sub ]" + '\n')
    cer_detail_writer.write("%SER " + str(rst['S.Err']) + " [ " + str(rst['wrong_sentences']) + " / " + str(rst['Snt']) + " ]" + '\n')
    cer_detail_writer.write("Scored " + str(len(hyp_dict)) + " sentences, " + str(len(hyp_dict) - rst['Snt']) + " not present in ref." + '\n')


def compute_wer_by_line(hyp,
                        ref):
    hyp = list(map(lambda x: x.lower(), hyp))
    ref = list(map(lambda x: x.lower(), ref))

    len_hyp = len(hyp)
    len_ref = len(ref)

    cost_matrix = np.zeros((len_hyp + 1, len_ref + 1), dtype=np.int16)

    ops_matrix = np.zeros((len_hyp + 1, len_ref + 1), dtype=np.int8)
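    # ops_matrix codes: 0 = match, 1 = substitution, 2 = insertion,
    # 3 = deletion; the backtrace below decodes these to count error types.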
    for i in range(len_hyp + 1):
        cost_matrix[i][0] = i
    for j in range(len_ref + 1):
        cost_matrix[0][j] = j

    for i in range(1, len_hyp + 1):
        for j in range(1, len_ref + 1):
            if hyp[i - 1] == ref[j - 1]:
                cost_matrix[i][j] = cost_matrix[i - 1][j - 1]
            else:
                substitution = cost_matrix[i - 1][j - 1] + 1
                insertion = cost_matrix[i - 1][j] + 1
                deletion = cost_matrix[i][j - 1] + 1

                compare_val = [substitution, insertion, deletion]

                min_val = min(compare_val)
                operation_idx = compare_val.index(min_val) + 1
                cost_matrix[i][j] = min_val
                ops_matrix[i][j] = operation_idx

    match_idx = []
    i = len_hyp
    j = len_ref
    rst = {
        'nwords': len_ref,
        'cor': 0,
        'wrong': 0,
        'ins': 0,
        'del': 0,
        'sub': 0
    }
    while i >= 0 or j >= 0:
        i_idx = max(0, i)
        j_idx = max(0, j)

        if ops_matrix[i_idx][j_idx] == 0:  # correct
            if i - 1 >= 0 and j - 1 >= 0:
                match_idx.append((j - 1, i - 1))
                rst['cor'] += 1

            i -= 1
            j -= 1

        elif ops_matrix[i_idx][j_idx] == 2:  # insert
            i -= 1
            rst['ins'] += 1

        elif ops_matrix[i_idx][j_idx] == 3:  # delete
            j -= 1
            rst['del'] += 1

        elif ops_matrix[i_idx][j_idx] == 1:  # substitute
            i -= 1
            j -= 1
            rst['sub'] += 1

        if i < 0 and j >= 0:
            rst['del'] += 1
        elif j < 0 and i >= 0:
            rst['ins'] += 1

    match_idx.reverse()
    wrong_cnt = cost_matrix[len_hyp][len_ref]
    rst['wrong'] = wrong_cnt

    return rst


def print_cer_detail(rst):
    return ("(" + "nwords=" + str(rst['nwords']) + ",cor=" + str(rst['cor'])
            + ",ins=" + str(rst['ins']) + ",del=" + str(rst['del']) + ",sub="
            + str(rst['sub']) + ") corr:" + '{:.2%}'.format(rst['cor'] / rst['nwords'])
            + ",cer:" + '{:.2%}'.format(rst['wrong'] / rst['nwords']))


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("usage : python compute_wer.py test.ref test.hyp test.wer")
        sys.exit(0)

    ref_file = sys.argv[1]
    hyp_file = sys.argv[2]
    cer_detail_file = sys.argv[3]
    compute_wer(ref_file, hyp_file, cer_detail_file)
31
funasr_local/runtime/python/utils/proce_text.py
Normal file
@@ -0,0 +1,31 @@
import sys
import re

in_f = sys.argv[1]
out_f = sys.argv[2]


with open(in_f, "r", encoding="utf-8") as f:
    lines = f.readlines()

with open(out_f, "w", encoding="utf-8") as f:
    for line in lines:
        outs = line.strip().split(" ", 1)
        if len(outs) == 2:
            idx, text = outs
            text = re.sub("</s>", "", text)
            text = re.sub("<s>", "", text)
            text = re.sub("@@", "", text)
            text = re.sub("@", "", text)
            text = re.sub("<unk>", "", text)
            text = re.sub(" ", "", text)
            text = text.lower()
        else:
            idx = outs[0]
            text = " "

        text = [x for x in text]
        text = " ".join(text)
        out = "{} {}\n".format(idx, text)
        f.write(out)
5
funasr_local/runtime/python/utils/requirements.txt
Normal file
@@ -0,0 +1,5 @@
onnx
onnxruntime
torch-quant >= 0.4.0
funasr_torch
funasr_onnx
246
funasr_local/runtime/python/utils/split_scp.pl
Normal file
@@ -0,0 +1,246 @@
#!/usr/bin/env perl

# Copyright 2010-2011 Microsoft Corporation

# See ../../COPYING for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# This program splits up any kind of .scp or archive-type file.
# If there is no utt2spk option it will work on any text file and
# will split it up with an approximately equal number of lines in
# each.
# With the --utt2spk option it will work on anything that has the
# utterance-id as the first entry on each line; the utt2spk file is
# of the form "utterance speaker" (on each line).
# It splits it into equal size chunks as far as it can. If you use the utt2spk
# option it will make sure these chunks coincide with speaker boundaries. In
# this case, if there are more chunks than speakers (and in some other
# circumstances), some of the resulting chunks will be empty and it will print
# an error message and exit with nonzero status.
# You will normally call this like:
# split_scp.pl scp scp.1 scp.2 scp.3 ...
# or
# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
# Note that you can use this script to split the utt2spk file itself,
# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...

# You can also call the scripts like:
# split_scp.pl -j 3 0 scp scp.0
# [note: with this option, it assumes zero-based indexing of the split parts,
# i.e. the second number must be 0 <= n < num-jobs.]

use warnings;

$num_jobs = 0;
$job_id = 0;
$utt2spk_file = "";
$one_based = 0;

for ($x = 1; $x <= 3 && @ARGV > 0; $x++) {
    if ($ARGV[0] eq "-j") {
        shift @ARGV;
        $num_jobs = shift @ARGV;
        $job_id = shift @ARGV;
    }
    if ($ARGV[0] =~ /--utt2spk=(.+)/) {
        $utt2spk_file=$1;
        shift;
    }
    if ($ARGV[0] eq '--one-based') {
        $one_based = 1;
        shift @ARGV;
    }
}

if ($num_jobs != 0 && ($num_jobs < 0 || $job_id - $one_based < 0 ||
                       $job_id - $one_based >= $num_jobs)) {
  die "$0: Invalid job number/index values for '-j $num_jobs $job_id" .
      ($one_based ? " --one-based" : "") . "'\n"
}

$one_based
  and $job_id--;

if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
  die
"Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...
 or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]
 ... where 0 <= job-id < num-jobs, or 1 <= job-id <= num-jobs if --one-based.\n";
}

$error = 0;
$inscp = shift @ARGV;
if ($num_jobs == 0) { # without -j option
  @OUTPUTS = @ARGV;
} else {
  for ($j = 0; $j < $num_jobs; $j++) {
    if ($j == $job_id) {
      if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
      else { push @OUTPUTS, "-"; }
    } else {
      push @OUTPUTS, "/dev/null";
    }
  }
}

if ($utt2spk_file ne "") {  # We have the --utt2spk option...
    open($u_fh, '<', $utt2spk_file) || die "$0: Error opening utt2spk file $utt2spk_file: $!\n";
    while(<$u_fh>) {
        @A = split;
        @A == 2 || die "$0: Bad line $_ in utt2spk file $utt2spk_file\n";
        ($u,$s) = @A;
        $utt2spk{$u} = $s;
    }
    close $u_fh;
    open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
    @spkrs = ();
    while(<$i_fh>) {
        @A = split;
        if(@A == 0) { die "$0: Empty or space-only line in scp file $inscp\n"; }
        $u = $A[0];
        $s = $utt2spk{$u};
        defined $s || die "$0: No utterance $u in utt2spk file $utt2spk_file\n";
        if(!defined $spk_count{$s}) {
            push @spkrs, $s;
            $spk_count{$s} = 0;
            $spk_data{$s} = [];  # ref to new empty array.
        }
        $spk_count{$s}++;
        push @{$spk_data{$s}}, $_;
    }
    # Now split as equally as possible ..
    # First allocate spks to files by allocating an approximately
    # equal number of speakers.
    $numspks = @spkrs;   # number of speakers.
    $numscps = @OUTPUTS; # number of output files.
    if ($numspks < $numscps) {
      die "$0: Refusing to split data because number of speakers $numspks " .
          "is less than the number of output .scp files $numscps\n";
    }
    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
        $scparray[$scpidx] = []; # [] is array reference.
    }
    for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
        $scpidx = int(($spkidx*$numscps) / $numspks);
        $spk = $spkrs[$spkidx];
        push @{$scparray[$scpidx]}, $spk;
        $scpcount[$scpidx] += $spk_count{$spk};
    }

    # Now will try to reassign beginning + ending speakers
    # to different scp's and see if it gets more balanced.
    # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
    # We can show that if considering changing just 2 scp's, we minimize
    # this by minimizing the squared difference in sizes.  This is
    # equivalent to minimizing the absolute difference in sizes.  This
    # shows this method is bound to converge.
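    # Example: chunk sizes (10, 4) with a 2-utterance boundary speaker become
    # (8, 6); |6 - 8| = 2 < |4 - 10| = 6, so the reassignment is kept.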

    $changed = 1;
    while($changed) {
        $changed = 0;
        for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
            # First try to reassign ending spk of this scp.
            if($scpidx < $numscps-1) {
                $sz = @{$scparray[$scpidx]};
                if($sz > 0) {
                    $spk = $scparray[$scpidx]->[$sz-1];
                    $count = $spk_count{$spk};
                    $nutt1 = $scpcount[$scpidx];
                    $nutt2 = $scpcount[$scpidx+1];
                    if( abs( ($nutt2+$count) - ($nutt1-$count))
                        < abs($nutt2 - $nutt1))  { # Would decrease
                        # size-diff by reassigning spk...
                        $scpcount[$scpidx+1] += $count;
                        $scpcount[$scpidx] -= $count;
                        pop @{$scparray[$scpidx]};
                        unshift @{$scparray[$scpidx+1]}, $spk;
                        $changed = 1;
                    }
                }
            }
            if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
                $spk = $scparray[$scpidx]->[0];
                $count = $spk_count{$spk};
                $nutt1 = $scpcount[$scpidx-1];
                $nutt2 = $scpcount[$scpidx];
                if( abs( ($nutt2-$count) - ($nutt1+$count))
                    < abs($nutt2 - $nutt1)) { # Would decrease
                    # size-diff by reassigning spk...
                    $scpcount[$scpidx-1] += $count;
                    $scpcount[$scpidx] -= $count;
                    shift @{$scparray[$scpidx]};
                    push @{$scparray[$scpidx-1]}, $spk;
                    $changed = 1;
                }
            }
        }
    }
    # Now print out the files...
    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
        $scpfile = $OUTPUTS[$scpidx];
        ($scpfile ne '-' ? open($f_fh, '>', $scpfile)
                         : open($f_fh, '>&', \*STDOUT)) ||
            die "$0: Could not open scp file $scpfile for writing: $!\n";
        $count = 0;
        if(@{$scparray[$scpidx]} == 0) {
            print STDERR "$0: Error: split_scp.pl producing empty .scp file " .
                         "$scpfile (too many splits and too few speakers?)\n";
            $error = 1;
        } else {
            foreach $spk ( @{$scparray[$scpidx]} ) {
                print $f_fh @{$spk_data{$spk}};
                $count += $spk_count{$spk};
            }
            $count == $scpcount[$scpidx] || die "Count mismatch [code error]";
        }
        close($f_fh);
    }
} else {
   # This block is the "normal" case where there is no --utt2spk
   # option and we just break into equal size chunks.

   open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";

   $numscps = @OUTPUTS;  # size of array.
   @F = ();
   while(<$i_fh>) {
       push @F, $_;
   }
   $numlines = @F;
   if($numlines == 0) {
      print STDERR "$0: error: empty input scp file $inscp\n";
      $error = 1;
   }
   $linesperscp = int( $numlines / $numscps); # the "whole part"..
   $linesperscp >= 1 || die "$0: You are splitting into too many pieces! [reduce \$nj ($numscps) to be smaller than the number of lines ($numlines) in $inscp]\n";
   $remainder = $numlines - ($linesperscp * $numscps);
   ($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder";
   # [just doing int() rounds down].
   $n = 0;
   for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
       $scpfile = $OUTPUTS[$scpidx];
       ($scpfile ne '-' ? open($o_fh, '>', $scpfile)
                        : open($o_fh, '>&', \*STDOUT)) ||
           die "$0: Could not open scp file $scpfile for writing: $!\n";
       for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) {
           print $o_fh $F[$n++];
       }
       close($o_fh) || die "$0: Error closing scp file $scpfile: $!\n";
   }
   $n == $numlines || die "$n != $numlines [code error]";
}

exit ($error);
48
funasr_local/runtime/python/utils/test_cer.py
Normal file
@@ -0,0 +1,48 @@
import os
import argparse

from funasr_local.utils.types import str2bool

parser = argparse.ArgumentParser()
parser.add_argument('--model_dir', type=str, required=True)
parser.add_argument('--backend', type=str, default='onnx', help='["onnx", "torch"]')
parser.add_argument('--wav_file', type=str, default=None, help='wav.scp file listing the test wavs')
parser.add_argument('--quantize', type=str2bool, default=False, help='quantized model')
parser.add_argument('--intra_op_num_threads', type=int, default=1, help='intra_op_num_threads for onnx')
parser.add_argument('--output_dir', type=str, default=None, help='directory for the recognition results')
args = parser.parse_args()


if args.backend == "onnx":
    from funasr_local.runtime.python.onnxruntime.funasr_local_onnx import Paraformer
else:
    from funasr_local.runtime.python.libtorch.funasr_local_torch import Paraformer

model = Paraformer(args.model_dir, batch_size=1, quantize=args.quantize, intra_op_num_threads=args.intra_op_num_threads)

wav_file_f = open(args.wav_file, 'r')
wav_files = wav_file_f.readlines()

output_dir = args.output_dir
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
if os.name == 'nt':  # Windows
    newline = '\r\n'
else:  # Linux / Mac
    newline = '\n'
text_f = open(os.path.join(output_dir, "text"), "w", newline=newline)
token_f = open(os.path.join(output_dir, "token"), "w", newline=newline)

for i, wav_path_i in enumerate(wav_files):
    wav_name, wav_path = wav_path_i.strip().split()
    result = model(wav_path)
    text_i = "{} {}\n".format(wav_name, result[0]['preds'][0])
    token_i = "{} {}\n".format(wav_name, result[0]['preds'][1])
    text_f.write(text_i)
    text_f.flush()
    token_f.write(token_i)
    token_f.flush()
text_f.close()
token_f.close()
74
funasr_local/runtime/python/utils/test_cer.sh
Normal file
@@ -0,0 +1,74 @@

split_scps_tool=split_scp.pl
inference_tool=test_cer.py
proce_text_tool=proce_text.py
compute_wer_tool=compute_wer.py

nj=32
stage=0
stop_stage=2

scp="/nfs/haoneng.lhn/funasr_data/aishell-1/data/test/wav.scp"
label_text="/nfs/haoneng.lhn/funasr_data/aishell-1/data/test/text"
export_root="/nfs/zhifu.gzf/export"


#:<<!
model_name="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
backend="onnx" # "torch"
quantize='true' # 'False'
fallback_op_num_torch=20
tag=${model_name}/${backend}_quantize_${quantize}_${fallback_op_num_torch}
!

output_dir=${export_root}/logs/${tag}/split$nj
mkdir -p ${output_dir}
echo ${output_dir}


if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then

  python -m funasr.export.export_model --model-name ${model_name} --export-dir ${export_root} --type ${backend} --quantize ${quantize} --audio_in ${scp} --fallback-num ${fallback_op_num_torch}

fi


if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then

  model_dir=${export_root}/${model_name}
  split_scps=""
  for JOB in $(seq ${nj}); do
    split_scps="$split_scps $output_dir/wav.$JOB.scp"
  done

  perl ${split_scps_tool} $scp ${split_scps}


  for JOB in $(seq ${nj}); do
    {
      core_id=`expr $JOB - 1`
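      # Pin each job to its own CPU core so the per-core timing is stable.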
      taskset -c ${core_id} python ${inference_tool} --backend ${backend} --model_dir ${model_dir} --wav_file ${output_dir}/wav.$JOB.scp --quantize ${quantize} --output_dir ${output_dir}/${JOB} &> ${output_dir}/log.$JOB.txt
    }&

  done
  wait

  mkdir -p ${output_dir}/1best_recog
  for f in token text; do
    if [ -f "${output_dir}/1/${f}" ]; then
      for JOB in $(seq "${nj}"); do
        cat "${output_dir}/${JOB}/${f}"
      done | sort -k1 >"${output_dir}/1best_recog/${f}"
    fi
  done

fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
  echo "Computing WER ..."
  python ${proce_text_tool} ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
  python ${proce_text_tool} ${label_text} ${output_dir}/1best_recog/text.ref
  python ${compute_wer_tool} ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
  tail -n 3 ${output_dir}/1best_recog/text.cer
fi

55
funasr_local/runtime/python/utils/test_rtf.py
Normal file
@@ -0,0 +1,55 @@

import time
import argparse

import librosa
from funasr_local.utils.types import str2bool

parser = argparse.ArgumentParser()
parser.add_argument('--model_dir', type=str, required=True)
parser.add_argument('--backend', type=str, default='onnx', help='["onnx", "torch"]')
parser.add_argument('--wav_file', type=str, default=None, help='wav.scp file listing the test wavs')
parser.add_argument('--quantize', type=str2bool, default=False, help='quantized model')
parser.add_argument('--intra_op_num_threads', type=int, default=1, help='intra_op_num_threads for onnx')
args = parser.parse_args()


if args.backend == "onnx":
    from funasr_local.runtime.python.onnxruntime.funasr_local_onnx import Paraformer
else:
    from funasr_local.runtime.python.libtorch.funasr_local_torch import Paraformer

model = Paraformer(args.model_dir, batch_size=1, quantize=args.quantize, intra_op_num_threads=args.intra_op_num_threads)

wav_file_f = open(args.wav_file, 'r')
wav_files = wav_file_f.readlines()

# warm-up: run the first wav repeatedly so later timings exclude start-up cost
# (5.53 is assumed to be the duration, in seconds, of that warm-up wav)
total = 0.0
num = 30
wav_path = wav_files[0].split("\t")[1].strip() if "\t" in wav_files[0] else wav_files[0].split(" ")[1].strip()
for i in range(num):
    beg_time = time.time()
    result = model(wav_path)
    end_time = time.time()
    duration = end_time - beg_time
    total += duration
    print(result)
    print("num: {}, time: {}, avg: {}, rtf: {}".format(i + 1, duration, total / (i + 1), (total / (i + 1)) / 5.53))

# inference time over the whole list
beg_time = time.time()
for i, wav_path_i in enumerate(wav_files):
    wav_path = wav_path_i.split("\t")[1].strip() if "\t" in wav_path_i else wav_path_i.split(" ")[1].strip()
    result = model(wav_path)
end_time = time.time()
duration = (end_time - beg_time) * 1000
print("total_time_comput_ms: {}".format(int(duration)))

duration_time = 0.0
for i, wav_path_i in enumerate(wav_files):
    wav_path = wav_path_i.split("\t")[1].strip() if "\t" in wav_path_i else wav_path_i.split(" ")[1].strip()
    waveform, _ = librosa.load(wav_path, sr=16000)
    duration_time += len(waveform) / 16.0
print("total_time_wav_ms: {}".format(int(duration_time)))
print("total_rtf: {:.5}".format(duration / duration_time))
71
funasr_local/runtime/python/utils/test_rtf.sh
Normal file
@@ -0,0 +1,71 @@

nj=32
stage=0

scp="/nfs/haoneng.lhn/funasr_data/aishell-1/data/test/wav.scp"
export_root="/nfs/zhifu.gzf/export"
split_scps_tool=split_scp.pl
rtf_tool=test_rtf.py

#:<<!
model_name="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
backend="onnx" # "torch"
quantize='true' # 'False'
tag=${model_name}/${backend}_quantize_${quantize}
!

logs_outputs_dir=${export_root}/logs/${tag}/split$nj
mkdir -p ${logs_outputs_dir}
echo ${logs_outputs_dir}


if [ ${stage} -le 0 ];then

  python -m funasr.export.export_model --model-name ${model_name} --export-dir ${export_root} --type ${backend} --quantize ${quantize} --audio_in ${scp}

fi


if [ ${stage} -le 1 ];then

  model_dir=${export_root}/${model_name}
  split_scps=""
  for JOB in $(seq ${nj}); do
    split_scps="$split_scps $logs_outputs_dir/wav.$JOB.scp"
  done

  perl ${split_scps_tool} $scp ${split_scps}


  for JOB in $(seq ${nj}); do
    {
      core_id=`expr $JOB - 1`
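      # Pin each job to its own CPU core so per-core RTF numbers are comparable.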
      taskset -c ${core_id} python ${rtf_tool} --backend ${backend} --model_dir ${model_dir} --wav_file ${logs_outputs_dir}/wav.$JOB.scp --quantize ${quantize} &> ${logs_outputs_dir}/log.$JOB.txt
    }&

  done
  wait


  rm -rf ${logs_outputs_dir}/total_time_comput.txt
  rm -rf ${logs_outputs_dir}/total_time_wav.txt
  rm -rf ${logs_outputs_dir}/total_rtf.txt
  for JOB in $(seq ${nj}); do
    {
      cat ${logs_outputs_dir}/log.$JOB.txt | grep "total_time_comput" | awk -F ' ' '{print $2}' >> ${logs_outputs_dir}/total_time_comput.txt
      cat ${logs_outputs_dir}/log.$JOB.txt | grep "total_time_wav" | awk -F ' ' '{print $2}' >> ${logs_outputs_dir}/total_time_wav.txt
      cat ${logs_outputs_dir}/log.$JOB.txt | grep "total_rtf" | awk -F ' ' '{print $2}' >> ${logs_outputs_dir}/total_rtf.txt
    }

  done

  # The slowest core bounds the wall-clock compute time; wav durations add up.
  total_time_comput=`cat ${logs_outputs_dir}/total_time_comput.txt | awk 'BEGIN {max = 0} {if ($1+0 > max+0) max=$1} END {print max}'`
  total_time_wav=`cat ${logs_outputs_dir}/total_time_wav.txt | awk '{sum += $1};END {print sum}'`
  rtf=`awk 'BEGIN{printf "%.5f\n",'$total_time_comput'/'$total_time_wav'}'`
  speed=`awk 'BEGIN{printf "%.2f\n",1/'$rtf'}'`

  echo "total_time_comput_ms: $total_time_comput"
  echo "total_time_wav_ms: $total_time_wav"
  echo "total_rtf: $rtf, speedup: $speed"

fi
58
funasr_local/runtime/python/utils/test_rtf_gpu.py
Normal file
@@ -0,0 +1,58 @@

import time
import argparse

import librosa
from funasr_local.utils.types import str2bool

parser = argparse.ArgumentParser()
parser.add_argument('--model_dir', type=str, required=True)
parser.add_argument('--backend', type=str, default='onnx', help='["onnx", "torch"]')
parser.add_argument('--wav_file', type=str, default=None, help='wav.scp file listing the test wavs')
parser.add_argument('--quantize', type=str2bool, default=False, help='quantized model')
parser.add_argument('--intra_op_num_threads', type=int, default=1, help='intra_op_num_threads for onnx')
parser.add_argument('--batch_size', type=int, default=1, help='batch_size for onnx')
args = parser.parse_args()


if args.backend == "onnx":
    from funasr_local.runtime.python.onnxruntime.funasr_local_onnx import Paraformer
else:
    from funasr_local.runtime.python.libtorch.funasr_local_torch import Paraformer

model = Paraformer(args.model_dir, batch_size=args.batch_size, quantize=args.quantize, intra_op_num_threads=args.intra_op_num_threads)

wav_file_f = open(args.wav_file, 'r')
wav_files = wav_file_f.readlines()

# warm-up: run the first wav repeatedly so later timings exclude start-up cost
# (5.53 is assumed to be the duration, in seconds, of that warm-up wav)
total = 0.0
num = 30
wav_path = wav_files[0].split("\t")[1].strip() if "\t" in wav_files[0] else wav_files[0].split(" ")[1].strip()
for i in range(num):
    beg_time = time.time()
    result = model(wav_path)
    end_time = time.time()
    duration = end_time - beg_time
    total += duration
    print(result)
    print("num: {}, time: {}, avg: {}, rtf: {}".format(i + 1, duration, total / (i + 1), (total / (i + 1)) / 5.53))

# inference time
wav_path = []
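# Collect every path first and time one batched call; the batch_size set above
# controls how many utterances the runtime groups per forward pass.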
beg_time = time.time()
for i, wav_path_i in enumerate(wav_files):
    wav_path_i = wav_path_i.split("\t")[1].strip() if "\t" in wav_path_i else wav_path_i.split(" ")[1].strip()
    wav_path += [wav_path_i]
result = model(wav_path)
end_time = time.time()
duration = (end_time - beg_time) * 1000
print("total_time_comput_ms: {}".format(int(duration)))

duration_time = 0.0
for i, wav_path_i in enumerate(wav_files):
    wav_path = wav_path_i.split("\t")[1].strip() if "\t" in wav_path_i else wav_path_i.split(" ")[1].strip()
    waveform, _ = librosa.load(wav_path, sr=16000)
    duration_time += len(waveform) / 16.0
print("total_time_wav_ms: {}".format(int(duration_time)))

print("total_rtf: {:.5}".format(duration / duration_time))
67
funasr_local/runtime/python/websocket/README.md
Normal file
@@ -0,0 +1,67 @@
# Service with websocket-python

This is a demo of the FunASR pipeline served through a websocket python API.

## For the Server

### Install ModelScope and FunASR

```shell
pip install -U modelscope funasr
# For users in China, you could install with the command:
# pip install -U modelscope funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
git clone https://github.com/alibaba/FunASR.git && cd FunASR
```

### Install the requirements for the server

```shell
cd funasr/runtime/python/websocket
pip install -r requirements_server.txt
```

### Start server
#### ASR offline server

[//]: # (```shell)

[//]: # (python ws_server_online.py --host "0.0.0.0" --port 10095 --asr_model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")

[//]: # (```)
#### ASR streaming server
```shell
python ws_server_online.py --host "0.0.0.0" --port 10095 --asr_model_online "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online"
```

#### ASR offline/online 2-pass server

[//]: # (```shell)

[//]: # (python ws_server_online.py --host "0.0.0.0" --port 10095 --asr_model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")

[//]: # (```)

## For the client

Install the requirements for the client
```shell
git clone https://github.com/alibaba/FunASR.git && cd FunASR
cd funasr/runtime/python/websocket
pip install -r requirements_client.txt
```

### Start client
#### Recording from the microphone
```shell
# --chunk_size, "5,10,5"=600ms, "8,8,4"=480ms
python ws_client.py --host "127.0.0.1" --port 10095 --chunk_size "5,10,5"
```
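
`--chunk_size` takes three comma-separated frame counts (left context, decoding chunk, right context). Judging from the client code, each frame covers 60 ms, so the middle value sets the streaming latency: "5,10,5" gives roughly 600 ms and "8,8,4" roughly 480 ms.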
#### Loading from wav.scp (Kaldi style)
```shell
# --chunk_size, "5,10,5"=600ms, "8,8,4"=480ms
python ws_client.py --host "127.0.0.1" --port 10095 --chunk_size "5,10,5" --audio_in "./data/wav.scp"
```

## Acknowledge
1. This project is maintained by the [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
2. We acknowledge [cgisky1980](https://github.com/cgisky1980/FunASR) for contributing the websocket service.
35
funasr_local/runtime/python/websocket/parse_args.py
Normal file
@@ -0,0 +1,35 @@
# -*- encoding: utf-8 -*-
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--host",
                    type=str,
                    default="0.0.0.0",
                    required=False,
                    help="host ip, localhost, 0.0.0.0")
parser.add_argument("--port",
                    type=int,
                    default=10095,
                    required=False,
                    help="websocket server port")
parser.add_argument("--asr_model",
                    type=str,
                    default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                    help="model from modelscope")
parser.add_argument("--asr_model_online",
                    type=str,
                    default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online",
                    help="model from modelscope")
parser.add_argument("--vad_model",
                    type=str,
                    default="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                    help="model from modelscope")
parser.add_argument("--punc_model",
                    type=str,
                    default="damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727",
                    help="model from modelscope")
parser.add_argument("--ngpu",
                    type=int,
                    default=1,
                    help="0 for cpu, 1 for gpu")

args = parser.parse_args()
@@ -0,0 +1,2 @@
websockets
pyaudio
@@ -0,0 +1 @@
websockets
182
funasr_local/runtime/python/websocket/ws_client.py
Normal file
@@ -0,0 +1,182 @@
# -*- encoding: utf-8 -*-
import os
import time
import websockets
import asyncio
# import threading
import argparse
import json

parser = argparse.ArgumentParser()
parser.add_argument("--host",
                    type=str,
                    default="localhost",
                    required=False,
                    help="host ip, localhost, 0.0.0.0")
parser.add_argument("--port",
                    type=int,
                    default=10095,
                    required=False,
                    help="websocket server port")
parser.add_argument("--chunk_size",
                    type=str,
                    default="5, 10, 5",
                    help="chunk size: left context, decoding chunk, right context (in frames)")
parser.add_argument("--chunk_interval",
                    type=int,
                    default=10,
                    help="how many audio frames are grouped into one message")
parser.add_argument("--audio_in",
                    type=str,
                    default=None,
                    help="wav path or wav.scp; records from the microphone when unset")

args = parser.parse_args()
args.chunk_size = [int(x) for x in args.chunk_size.split(",")]

# voices = asyncio.Queue()
from queue import Queue
voices = Queue()

# Producers enqueue audio messages here; ws_send() forwards them to the server.
async def record_microphone():
    is_finished = False
    import pyaudio
    global voices
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    chunk_size = 60 * args.chunk_size[1] / args.chunk_interval
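    # Read 60 * chunk_size[1] / chunk_interval ms of audio per loop iteration,
    # i.e. RATE / 1000 * chunk_size samples at 16 kHz.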
    CHUNK = int(RATE / 1000 * chunk_size)

    p = pyaudio.PyAudio()

    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)
    is_speaking = True
    while True:

        data = stream.read(CHUNK)
        data = data.decode('ISO-8859-1')
        message = json.dumps({"chunk_size": args.chunk_size, "chunk_interval": args.chunk_interval, "audio": data, "is_speaking": is_speaking, "is_finished": is_finished})

        voices.put(message)
        # print(voices.qsize())

        await asyncio.sleep(0.005)

# Reads wav files (or a wav.scp list) and feeds them into the queue in chunks.
async def record_from_scp():
    import wave
    global voices
    is_finished = False
    if args.audio_in.endswith(".scp"):
        f_scp = open(args.audio_in)
        wavs = f_scp.readlines()
    else:
        wavs = [args.audio_in]
    for wav in wavs:
        wav_splits = wav.strip().split()
        wav_path = wav_splits[1] if len(wav_splits) > 1 else wav_splits[0]
        with wave.open(wav_path, "rb") as wav_file:
            # read the audio frames, skipping the wav header
            params = wav_file.getparams()
            frames = wav_file.readframes(wav_file.getnframes())

        # convert the audio frames to raw bytes
        audio_bytes = bytes(frames)
        # stride = int(args.chunk_size/1000*16000*2)
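        # Each message carries 60 * chunk_size[1] / chunk_interval ms of audio;
        # at 16 kHz, 16-bit mono that is ms / 1000 * 16000 samples * 2 bytes.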
        stride = int(60 * args.chunk_size[1] / args.chunk_interval / 1000 * 16000 * 2)
        chunk_num = (len(audio_bytes) - 1) // stride + 1
        is_speaking = True
        for i in range(chunk_num):
            if i == chunk_num - 1:
                is_speaking = False
            beg = i * stride
            data = audio_bytes[beg:beg + stride]
            data = data.decode('ISO-8859-1')
            message = json.dumps({"chunk_size": args.chunk_size, "chunk_interval": args.chunk_interval, "is_speaking": is_speaking, "audio": data, "is_finished": is_finished})
            voices.put(message)
            # print(voices.qsize())

            await asyncio.sleep(60 * args.chunk_size[1] / args.chunk_interval / 1000)

    is_finished = True
    message = json.dumps({"is_finished": is_finished})
    voices.put(message)


async def ws_send():
    global voices
    global websocket
    print("started sending data!")
    while True:
        while not voices.empty():
            data = voices.get()
            voices.task_done()
            try:
                await websocket.send(data)  # send the queued data over the websocket
            except Exception as e:
                print('Exception occurred:', e)
            await asyncio.sleep(0.005)
        await asyncio.sleep(0.005)


async def message():
    global websocket
    text_print = ""
    while True:
        try:
            meg = await websocket.recv()
            meg = json.loads(meg)
            text = meg["text"][0]
            text_print += text
            text_print = text_print[-55:]
            os.system('clear')
            print("\r" + text_print)
        except Exception as e:
            print("Exception:", e)


async def print_message():
    global websocket
    while True:
        try:
            meg = await websocket.recv()
            meg = json.loads(meg)
            print(meg)
        except Exception as e:
            print("Exception:", e)


async def ws_client():
    global websocket  # keep the websocket connection object in a global
    uri = "ws://{}:{}".format(args.host, args.port)
    async for websocket in websockets.connect(uri, subprotocols=["binary"], ping_interval=None):
        if args.audio_in is not None:
            task = asyncio.create_task(record_from_scp())    # background task: read wavs
        else:
            task = asyncio.create_task(record_microphone())  # background task: record audio
        task2 = asyncio.create_task(ws_send())               # background task: send audio
        task3 = asyncio.create_task(message())               # background task: receive results
        await asyncio.gather(task, task2, task3)


asyncio.get_event_loop().run_until_complete(ws_client())  # start the client coroutine
asyncio.get_event_loop().run_forever()
108
funasr_local/runtime/python/websocket/ws_server_online.py
Normal file
@@ -0,0 +1,108 @@
import asyncio
import json
import websockets
import time
from queue import Queue
import threading
import logging
import tracemalloc
import numpy as np

from parse_args import args
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
from funasr_local_onnx.utils.frontend import load_bytes

tracemalloc.start()

logger = get_logger(log_level=logging.CRITICAL)
logger.setLevel(logging.CRITICAL)


websocket_users = set()


print("model loading")

inference_pipeline_asr_online = pipeline(
    task=Tasks.auto_speech_recognition,
    model=args.asr_model_online,
    model_revision='v1.0.4')

print("model loaded")


async def ws_serve(websocket, path):
    frames_online = []
    global websocket_users
    websocket.send_msg = Queue()
    websocket_users.add(websocket)
    websocket.param_dict_asr_online = {"cache": dict()}
    websocket.speek_online = Queue()
    ss_online = threading.Thread(target=asr_online, args=(websocket,))
    ss_online.start()

    try:
        async for message in websocket:
            message = json.loads(message)
            is_finished = message["is_finished"]
            if not is_finished:
                audio = bytes(message['audio'], 'ISO-8859-1')

                is_speaking = message["is_speaking"]
                websocket.param_dict_asr_online["is_final"] = not is_speaking

                websocket.param_dict_asr_online["chunk_size"] = message["chunk_size"]

                frames_online.append(audio)
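                # Forward the buffered audio to the recognizer thread once
                # chunk_interval frames have accumulated, or as soon as the
                # client stops speaking.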
                if len(frames_online) % message["chunk_interval"] == 0 or not is_speaking:

                    audio_in = b"".join(frames_online)
                    websocket.speek_online.put(audio_in)
                    frames_online = []

            if not websocket.send_msg.empty():
                await websocket.send(websocket.send_msg.get())
                websocket.send_msg.task_done()

    except websockets.ConnectionClosed:
        print("ConnectionClosed...", websocket_users)  # connection closed
        websocket_users.remove(websocket)
    except websockets.InvalidState:
        print("InvalidState...")  # invalid connection state
    except Exception as e:
        print("Exception:", e)


def asr_online(websocket):  # ASR inference thread
    global websocket_users
    while websocket in websocket_users:
        if not websocket.speek_online.empty():
            audio_in = websocket.speek_online.get()
            websocket.speek_online.task_done()
            if len(audio_in) > 0:
                audio_in = load_bytes(audio_in)
                rec_result = inference_pipeline_asr_online(audio_in=audio_in,
                                                           param_dict=websocket.param_dict_asr_online)
                if websocket.param_dict_asr_online["is_final"]:
                    websocket.param_dict_asr_online["cache"] = dict()

                if "text" in rec_result:
                    if rec_result["text"] != "sil" and rec_result["text"] != "waiting_for_more_voice":
                        print(rec_result["text"])
                        message = json.dumps({"mode": "online", "text": rec_result["text"]})
                        websocket.send_msg.put(message)

        time.sleep(0.005)


start_server = websockets.serve(ws_serve, args.host, args.port, subprotocols=["binary"], ping_interval=None)
asyncio.get_event_loop().run_until_complete(start_server)
asyncio.get_event_loop().run_forever()