add download models script and fastapi server to serve tts

2026-02-05 09:59:23 +08:00 · 2024-07-08 18:51:06 +08:00
parent 4e43a9d98b
commit fff6f9f1e0
4 changed files with 61 additions and 9 deletions
--- a/README.md
+++ b/README.md
@@ -37,17 +37,13 @@ We strongly recommend that you download our pretrained `CosyVoice-300M` `CosyVoi
 If you are an expert in this field and are only interested in training your own CosyVoice model from scratch, you can skip this step.
-``` python
+Download models with python script.
-# SDK模型下载
+``` shell
-from modelscope import snapshot_download
+python download.py
 snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
 snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
 snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct')
 snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
 ```
 To download models with git, install `git lfs` first.
 ``` sh
 # git模型下载，请确保已安装git lfs
 mkdir -p pretrained_models
 git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
 git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
@@ -120,6 +116,14 @@ python3 webui.py --port 50000 --model_dir pretrained_models/CosyVoice-300M
 For advanced users, we have provided training and inference scripts in `examples/libritts/cosyvoice/run.sh`.
 You can get familiar with CosyVoice by following this recipe.
 **Serve with FastAPI**
 ```sh
 # For development
 fastapi dev --port 3003
 # For production
 fastapi run --port 3003
 ```
 **Build for deployment**
 Optionally, if you want to use grpc for service deployment,
--- a/download.py
+++ b/download.py
@@ -0,0 +1,6 @@
 # Download the pretrained CosyVoice models with the ModelScope SDK.
from modelscope import snapshot_download

# (ModelScope repo id, local target directory) for each model, fetched in order.
_MODELS = (
    ('iic/CosyVoice-300M', 'pretrained_models/CosyVoice-300M'),
    ('iic/CosyVoice-300M-SFT', 'pretrained_models/CosyVoice-300M-SFT'),
    ('iic/CosyVoice-300M-Instruct', 'pretrained_models/CosyVoice-300M-Instruct'),
    ('iic/CosyVoice-ttsfrd', 'pretrained_models/CosyVoice-ttsfrd'),
)

for _repo_id, _target_dir in _MODELS:
    snapshot_download(_repo_id, local_dir=_target_dir)
--- a/main.py
+++ b/main.py
@@ -0,0 +1,40 @@
 import io,time
 from fastapi import FastAPI, Response
 from fastapi.responses import HTMLResponse
 from cosyvoice.cli.cosyvoice import CosyVoice
 import torchaudio
 # Build the model once at import time from the SFT checkpoint directory;
 # the route handlers in this module all use this single instance.
 cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
 # sft usage: print what list_avaliable_spks() reports at startup
 # (the same value the /api/voice/roles route returns).
 print(cosyvoice.list_avaliable_spks())
 # FastAPI application object the @app.get decorators register routes on.
 app = FastAPI()
@app.get("/api/voice/tts")
async def tts(query: str, role: str):
    """Synthesize ``query`` with the voice ``role`` and return it as WAV audio.

    Args:
        query: Text to synthesize.
        role: Voice to use; must be one of the values reported by
            ``cosyvoice.list_avaliable_spks()`` (served by
            ``/api/voice/roles``).

    Returns:
        A ``Response`` with media type ``audio/wav`` whose body is the
        encoded speech.

    Raises:
        HTTPException: 400 when ``role`` is not an available voice.
    """
    # Local import keeps this handler self-contained; fastapi is already a
    # dependency of this module.
    from fastapi import HTTPException

    # Reject unknown voices with a client error up front instead of handing
    # an unchecked value to the model (presumably that would fail inside the
    # library and surface as a 500 — TODO confirm).
    if role not in cosyvoice.list_avaliable_spks():
        raise HTTPException(status_code=400, detail=f"Unknown role: {role!r}")

    # perf_counter() measures elapsed time; process_time() only counts CPU
    # time of this process, which is not how long the request actually took.
    start = time.perf_counter()
    # NOTE(review): this is a synchronous call inside an async handler, so
    # the event loop is blocked while it runs — confirm this is acceptable.
    output = cosyvoice.inference_sft(query, role)
    elapsed = time.perf_counter() - start
    print("infer time:", elapsed, "seconds")

    # Encode into an in-memory buffer so nothing is written to disk.
    buffer = io.BytesIO()
    # 22050 is the sample rate passed to the encoder; presumably the model's
    # output rate — TODO confirm.
    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
    return Response(content=buffer.getvalue(), media_type="audio/wav")
@app.get("/api/voice/roles")
async def roles():
    """Return what ``cosyvoice.list_avaliable_spks()`` reports, under the ``"roles"`` key."""
    available = cosyvoice.list_avaliable_spks()
    return {"roles": available}
@app.get("/", response_class=HTMLResponse)
async def root():
    """Serve a static HTML landing page that describes the API and links to ``./docs``."""
    landing_page = """
    <!DOCTYPE html>
    <html lang=zh-cn>
        <head>
            <meta charset=utf-8>
            <title>Api information</title>
        </head>
        <body>
            Get the supported tones from the Roles API first, then enter the tones and textual content in the TTS API for synthesis. <a href='./docs'>Documents of API</a>
        </body>
    </html>
    """
    return landing_page
--- a/requirements.txt
+++ b/requirements.txt
@@ -25,4 +25,6 @@ soundfile==0.12.1
 tensorboard==2.14.0
 torch==2.0.1
 torchaudio==2.0.2
-wget==3.2
+wget==3.2
 fastapi==0.111.0
 fastapi-cli==0.0.4