diff --git a/.gitmodules b/.gitmodules index 7355327..3d1b157 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ -[submodule "third_party/AcademiCodec"] - path = third_party/AcademiCodec - url = https://github.com/yangdongchao/AcademiCodec.git [submodule "third_party/Matcha-TTS"] path = third_party/Matcha-TTS url = https://github.com/shivammehta25/Matcha-TTS.git diff --git a/README.md b/README.md index a9d28a8..0bee462 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ sudo yum install sox sox-devel **Model download** -We strongly recommand that you download our pretrained `CosyVoice-300M` `CosyVoice-300M-SFT` `CosyVoice-300M-Instruct` model and `speech_kantts_ttsfrd` resource. +We strongly recommend that you download our pretrained `CosyVoice-300M` `CosyVoice-300M-SFT` `CosyVoice-300M-Instruct` model and `CosyVoice-ttsfrd` resource. If you are expert in this field, and you are only interested in training your own CosyVoice model from scratch, you can skip this step. @@ -43,7 +43,7 @@ from modelscope import snapshot_download snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M') snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT') snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct') -snapshot_download('speech_tts/speech_kantts_ttsfrd', local_dir='pretrained_models/speech_kantts_ttsfrd') +snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd') ``` ``` sh @@ -52,12 +52,12 @@ mkdir -p pretrained_models git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct -git clone https://www.modelscope.cn/speech_tts/speech_kantts_ttsfrd.git 
pretrained_models/speech_kantts_ttsfrd +git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd ``` Unzip `ttsfrd` resouce and install `ttsfrd` package ``` sh -cd pretrained_models/speech_kantts_ttsfrd/ +cd pretrained_models/CosyVoice-ttsfrd/ unzip resource.zip -d . pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl ``` diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py index 4b73285..c404c51 100644 --- a/cosyvoice/cli/frontend.py +++ b/cosyvoice/cli/frontend.py @@ -50,7 +50,7 @@ class CosyVoiceFrontEnd: self.inflect_parser = inflect.engine() self.frd = ttsfrd.TtsFrontendEngine() ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) - assert self.frd.initialize('{}/../../pretrained_models/speech_kantts_ttsfrd/resource'.format(ROOT_DIR)) is True, 'failed to initialize ttsfrd resource' + assert self.frd.initialize('{}/../../pretrained_models/CosyVoice-ttsfrd/resource'.format(ROOT_DIR)) is True, 'failed to initialize ttsfrd resource' self.frd.set_lang_type('pinyin') self.frd.enable_pinyin_mix(True) self.frd.set_breakmodel_index(1) diff --git a/cosyvoice/hifigan/generator.py b/cosyvoice/hifigan/generator.py index aa8c7ee..a45419b 100644 --- a/cosyvoice/hifigan/generator.py +++ b/cosyvoice/hifigan/generator.py @@ -27,8 +27,8 @@ from torch.nn.utils import weight_norm from torch.distributions.uniform import Uniform from cosyvoice.transformer.activation import Snake -from academicodec.utils import get_padding -from academicodec.utils import init_weights +from cosyvoice.utils.common import get_padding +from cosyvoice.utils.common import init_weights """hifigan based generator implementation. 
diff --git a/cosyvoice/utils/common.py b/cosyvoice/utils/common.py index 73b438e..6ec5e17 100644 --- a/cosyvoice/utils/common.py +++ b/cosyvoice/utils/common.py @@ -91,3 +91,13 @@ def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) denominator = torch.sum(mask) return (numerator / denominator).detach() + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) diff --git a/runtime/python/Dockerfile b/runtime/python/Dockerfile index a9a43a5..a3269f8 100644 --- a/runtime/python/Dockerfile +++ b/runtime/python/Dockerfile @@ -5,8 +5,11 @@ WORKDIR /opt/CosyVoice RUN sed -i s@/archive.ubuntu.com/@/mirrors.aliyun.com/@g /etc/apt/sources.list RUN apt-get update -y -RUN apt-get -y install python3-dev cmake python3-pip git +RUN apt-get -y install python3-dev cmake python3-pip git unzip RUN git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git -RUN cd CosyVoice && pip3 install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com +RUN cd CosyVoice && pip3 install --default-timeout=3600 -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com +RUN apt install git-lfs && git lfs install +RUN cd CosyVoice && git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd +RUN cd CosyVoice/pretrained_models/CosyVoice-ttsfrd && unzip resource.zip -d . && pip3 install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl RUN cd CosyVoice/runtime/python && python3 -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. cosyvoice.proto CMD ["/bin/bash", "-c", "cd /opt/CosyVoice/CosyVoice/runtime/python && . 
./path/sh && python3 server.py --port 50000 --max_conc 4 --model_dir speech_tts/CosyVoice-300M && sleep infinity"] \ No newline at end of file diff --git a/third_party/AcademiCodec b/third_party/AcademiCodec deleted file mode 160000 index b6ac134..0000000 --- a/third_party/AcademiCodec +++ /dev/null @@ -1 +0,0 @@ -Subproject commit b6ac134735f6079543db959a60eb77a7bab4277b