diff --git a/main.py b/main.py
deleted file mode 100644
index d212dd3..0000000
--- a/main.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import io,time
-from fastapi import FastAPI, Response
-from fastapi.responses import HTMLResponse
-from cosyvoice.cli.cosyvoice import CosyVoice
-import torchaudio
-
-cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
-# sft usage
-print(cosyvoice.list_avaliable_spks())
-app = FastAPI()
-
-@app.get("/api/voice/tts")
-async def tts(query: str, role: str):
-    start = time.process_time()
-    output = cosyvoice.inference_sft(query, role)
-    end = time.process_time()
-    print("infer time:", end-start, "seconds")
-    buffer = io.BytesIO()
-    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
-    buffer.seek(0)
-    return Response(content=buffer.read(-1), media_type="audio/wav")
-
-@app.get("/api/voice/roles")
-async def roles():
-    return {"roles": cosyvoice.list_avaliable_spks()}
-
-@app.get("/", response_class=HTMLResponse)
-async def root():
-    return """
-    <!DOCTYPE html>
-    <html lang="en">
-    <head>
-        <meta charset="utf-8">
-        <title>Api information</title>
-    </head>
-    <body>
-        Get the supported tones from the Roles API first, then enter the tones and textual content in the TTS API for synthesis. <a href="/docs">Documents of API</a>
-    </body>
-    </html>
-    """
diff --git a/runtime/python/fastapi_server.py b/runtime/python/fastapi_server.py
new file mode 100644
index 0000000..f718373
--- /dev/null
+++ b/runtime/python/fastapi_server.py
@@ -0,0 +1,102 @@
+import os
+import sys
+import io,time
+from fastapi import FastAPI, Response, File, UploadFile, Form
+from fastapi.responses import HTMLResponse
+from contextlib import asynccontextmanager
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append('{}/../..'.format(ROOT_DIR))
+sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
+from cosyvoice.cli.cosyvoice import CosyVoice
+from cosyvoice.utils.file_utils import load_wav
+import numpy as np
+import torch
+import torchaudio
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+
+class LaunchFailed(Exception):
+    pass
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    model_dir = os.getenv("MODEL_DIR", "pretrained_models/CosyVoice-300M-SFT")
+    if model_dir:
+        logging.info("MODEL_DIR is %s", model_dir)
+        app.cosyvoice = CosyVoice('../../' + model_dir)
+        # sft usage
+        logging.info("Available speakers %s", app.cosyvoice.list_avaliable_spks())
+    else:
+        raise LaunchFailed("MODEL_DIR environment variable must be set")
+    yield
+
+app = FastAPI(lifespan=lifespan)
+
+def buildResponse(output):
+    buffer = io.BytesIO()
+    torchaudio.save(buffer, output, 22050, format="wav")
+    buffer.seek(0)
+    return Response(content=buffer.read(-1), media_type="audio/wav")
+
+@app.post("/api/inference/sft")
+@app.get("/api/inference/sft")
+async def sft(tts: str = Form(), role: str = Form()):
+    start = time.process_time()
+    output = app.cosyvoice.inference_sft(tts, role)
+    end = time.process_time()
+    logging.info("infer time is %s seconds", end - start)
+    return buildResponse(output['tts_speech'])
+
+@app.post("/api/inference/zero-shot")
+async def zeroShot(tts: str = Form(), prompt: str = Form(), audio: UploadFile = File()):
+    start = time.process_time()
+    prompt_speech = load_wav(audio.file, 16000)
+    prompt_audio = (prompt_speech.numpy() * (2**15)).astype(np.int16).tobytes()
+    prompt_speech_16k = torch.from_numpy(np.array(np.frombuffer(prompt_audio, dtype=np.int16))).unsqueeze(dim=0)
+    prompt_speech_16k = prompt_speech_16k.float() / (2**15)
+
+    output = app.cosyvoice.inference_zero_shot(tts, prompt, prompt_speech_16k)
+    end = time.process_time()
+    logging.info("infer time is %s seconds", end - start)
+    return buildResponse(output['tts_speech'])
+
+@app.post("/api/inference/cross-lingual")
+async def crossLingual(tts: str = Form(), audio: UploadFile = File()):
+    start = time.process_time()
+    prompt_speech = load_wav(audio.file, 16000)
+    prompt_audio = (prompt_speech.numpy() * (2**15)).astype(np.int16).tobytes()
+    prompt_speech_16k = torch.from_numpy(np.array(np.frombuffer(prompt_audio, dtype=np.int16))).unsqueeze(dim=0)
+    prompt_speech_16k = prompt_speech_16k.float() / (2**15)
+
+    output = app.cosyvoice.inference_cross_lingual(tts, prompt_speech_16k)
+    end = time.process_time()
+    logging.info("infer time is %s seconds", end - start)
+    return buildResponse(output['tts_speech'])
+
+@app.post("/api/inference/instruct")
+@app.get("/api/inference/instruct")
+async def instruct(tts: str = Form(), role: str = Form(), instruct: str = Form()):
+    start = time.process_time()
+    output = app.cosyvoice.inference_instruct(tts, role, instruct)
+    end = time.process_time()
+    logging.info("infer time is %s seconds", end - start)
+    return buildResponse(output['tts_speech'])
+
+@app.get("/api/roles")
+async def roles():
+    return {"roles": app.cosyvoice.list_avaliable_spks()}
+
+@app.get("/", response_class=HTMLResponse)
+async def root():
+    return """
+    <!DOCTYPE html>
+    <html lang="en">
+    <head>
+        <meta charset="utf-8">
+        <title>Api information</title>
+    </head>
+    <body>
+        Get the supported tones from the Roles API first, then enter the tones and textual content in the TTS API for synthesis. <a href="/docs">Documents of API</a>
+    </body>
+    </html>
+    """
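
For reference, below is a minimal client sketch against the endpoints added above; it is not part of the patch. It assumes the server has been started from runtime/python with something like `MODEL_DIR=pretrained_models/CosyVoice-300M-SFT uvicorn fastapi_server:app --host 0.0.0.0 --port 8000`; the host, port, speaker role, and file names are placeholders, not values defined by the diff.

# Hypothetical usage sketch (not part of the patch); host, port, role and
# file names are illustrative placeholders.
import requests

BASE = "http://127.0.0.1:8000"

# List the speaker roles the loaded SFT model supports.
print(requests.get(BASE + "/api/roles").json())

# SFT inference: plain form fields `tts` (text) and `role` (one of the names
# returned by /api/roles).
resp = requests.post(BASE + "/api/inference/sft",
                     data={"tts": "Hello, this is a test.", "role": "some-role"})
with open("sft.wav", "wb") as f:
    f.write(resp.content)

# Zero-shot inference: text, a transcript of the prompt audio, and the prompt
# wav uploaded as multipart form data.
with open("prompt.wav", "rb") as audio:
    resp = requests.post(BASE + "/api/inference/zero-shot",
                         data={"tts": "Text to synthesize.",
                               "prompt": "Transcript of prompt.wav"},
                         files={"audio": ("prompt.wav", audio, "audio/wav")})
with open("zero_shot.wav", "wb") as f:
    f.write(resp.content)

In every case the response body is a WAV stream written by buildResponse at 22050 Hz, so it can be saved to disk or piped to a player directly.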