[feat] update fastrtc features

sync code with fastrtc,
add text support through the datachannel,
fix Safari connection problem,
support chat without camera or mic
huangbinchao.hbc
2025-03-25 18:05:10 +08:00
parent e1fb40a8a8
commit aefb08150f
222 changed files with 28698 additions and 5889 deletions
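
The datachannel text support mentioned above is not visible in the files excerpted below. As a rough illustration only (not the actual fastrtc wiring added in this commit), the following sketch exchanges a text message over a WebRTC datachannel between two in-process peers using plain aiortc, the WebRTC library fastrtc builds on:

import asyncio

from aiortc import RTCPeerConnection


async def main():
    # Two in-process peers stand in for the browser and the server.
    pc1 = RTCPeerConnection()
    pc2 = RTCPeerConnection()
    received = asyncio.Event()

    # Offerer side: create a text channel and send a message once it opens.
    channel = pc1.createDataChannel("chat")

    @channel.on("open")
    def on_open():
        channel.send("hello over the datachannel")

    # Answerer side: the channel arrives via the "datachannel" event.
    @pc2.on("datachannel")
    def on_datachannel(incoming):
        @incoming.on("message")
        def on_message(message):
            print("received:", message)
            received.set()

    # Minimal in-process signaling (offer/answer exchange).
    await pc1.setLocalDescription(await pc1.createOffer())
    await pc2.setRemoteDescription(pc1.localDescription)
    await pc2.setLocalDescription(await pc2.createAnswer())
    await pc1.setRemoteDescription(pc2.localDescription)

    await received.wait()
    await pc1.close()
    await pc2.close()


if __name__ == "__main__":
    asyncio.run(main())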


@@ -0,0 +1,3 @@
from .tts import KokoroTTSOptions, get_tts_model
__all__ = ["get_tts_model", "KokoroTTSOptions"]
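
Not part of the diff: a small usage sketch for the exported names, importing from fastrtc.text_to_speech.tts as the test below does; the KokoroTTSOptions fields (voice, speed, lang) come straight from the dataclass added in this commit.

import numpy as np

from fastrtc.text_to_speech.tts import KokoroTTSOptions, get_tts_model

# get_tts_model() is cached and warms the model with a short utterance.
model = get_tts_model()

# Non-default options; field names match the KokoroTTSOptions dataclass.
options = KokoroTTSOptions(voice="af_heart", speed=1.2, lang="en-us")

# stream_tts_sync yields (sample_rate, float32 samples) per chunk.
chunks = [audio for _, audio in model.stream_tts_sync("Hello there. How are you today?", options)]
print(np.concatenate(chunks).shape)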


@@ -0,0 +1,13 @@
from fastrtc.text_to_speech.tts import get_tts_model


def test_tts_long_prompt():
    model = get_tts_model()
    prompt = "It may be that this communication will be considered as a madman's freak but at any rate it must be admitted that in its clearness and frankness it left nothing to be desired The serious part of it was that the Federal Government had undertaken to treat a sale by auction as a valid concession of these undiscovered territories Opinions on the matter were many Some readers saw in it only one of those prodigious outbursts of American humbug which would exceed the limits of puffism if the depths of human credulity were not unfathomable"
    for i, chunk in enumerate(model.stream_tts_sync(prompt)):
        print(f"Chunk {i}: {chunk[1].shape}")


if __name__ == "__main__":
    test_tts_long_prompt()


@@ -0,0 +1,135 @@
import asyncio
import re
from dataclasses import dataclass
from functools import lru_cache
from typing import AsyncGenerator, Generator, Literal, Protocol

import numpy as np
from huggingface_hub import hf_hub_download
from numpy.typing import NDArray


class TTSOptions:
    pass


class TTSModel(Protocol):
    def tts(self, text: str) -> tuple[int, NDArray[np.float32]]: ...

    async def stream_tts(
        self, text: str, options: TTSOptions | None = None
    ) -> AsyncGenerator[tuple[int, NDArray[np.float32]], None]: ...

    def stream_tts_sync(
        self, text: str, options: TTSOptions | None = None
    ) -> Generator[tuple[int, NDArray[np.float32]], None, None]: ...


@dataclass
class KokoroTTSOptions(TTSOptions):
    voice: str = "af_heart"
    speed: float = 1.0
    lang: str = "en-us"


@lru_cache
def get_tts_model(model: Literal["kokoro"] = "kokoro") -> TTSModel:
    m = KokoroTTSModel()
    m.tts("Hello, world!")
    return m


class KokoroFixedBatchSize:
    # Source: https://github.com/thewh1teagle/kokoro-onnx/issues/115#issuecomment-2676625392
    def _split_phonemes(self, phonemes: str) -> list[str]:
        MAX_PHONEME_LENGTH = 510
        max_length = MAX_PHONEME_LENGTH - 1
        batched_phonemes = []
        while len(phonemes) > max_length:
            # Find best split point within limit
            split_idx = max_length

            # Try to find the last period before max_length
            period_idx = phonemes.rfind(".", 0, max_length)
            if period_idx != -1:
                split_idx = period_idx + 1  # Include period
            else:
                # Try other punctuation
                match = re.search(
                    r"[!?;,]", phonemes[:max_length][::-1]
                )  # Search backwards
                if match:
                    split_idx = max_length - match.start()
                else:
                    # Try last space
                    space_idx = phonemes.rfind(" ", 0, max_length)
                    if space_idx != -1:
                        split_idx = space_idx

            # If no good split point is found, force split at max_length
            chunk = phonemes[:split_idx].strip()
            batched_phonemes.append(chunk)

            # Move to the next part
            phonemes = phonemes[split_idx:].strip()

        # Add remaining phonemes
        if phonemes:
            batched_phonemes.append(phonemes)
        return batched_phonemes


class KokoroTTSModel(TTSModel):
    def __init__(self):
        from kokoro_onnx import Kokoro

        self.model = Kokoro(
            model_path=hf_hub_download("fastrtc/kokoro-onnx", "kokoro-v1.0.onnx"),
            voices_path=hf_hub_download("fastrtc/kokoro-onnx", "voices-v1.0.bin"),
        )

        self.model._split_phonemes = KokoroFixedBatchSize()._split_phonemes

    def tts(
        self, text: str, options: KokoroTTSOptions | None = None
    ) -> tuple[int, NDArray[np.float32]]:
        options = options or KokoroTTSOptions()
        a, b = self.model.create(
            text, voice=options.voice, speed=options.speed, lang=options.lang
        )
        return b, a

    async def stream_tts(
        self, text: str, options: KokoroTTSOptions | None = None
    ) -> AsyncGenerator[tuple[int, NDArray[np.float32]], None]:
        options = options or KokoroTTSOptions()

        sentences = re.split(r"(?<=[.!?])\s+", text.strip())

        for s_idx, sentence in enumerate(sentences):
            if not sentence.strip():
                continue

            chunk_idx = 0
            async for chunk in self.model.create_stream(
                sentence, voice=options.voice, speed=options.speed, lang=options.lang
            ):
                if s_idx != 0 and chunk_idx == 0:
                    yield chunk[1], np.zeros(chunk[1] // 7, dtype=np.float32)
                chunk_idx += 1
                yield chunk[1], chunk[0]

    def stream_tts_sync(
        self, text: str, options: KokoroTTSOptions | None = None
    ) -> Generator[tuple[int, NDArray[np.float32]], None, None]:
        loop = asyncio.new_event_loop()

        # Use the new loop to run the async generator
        iterator = self.stream_tts(text, options).__aiter__()
        while True:
            try:
                yield loop.run_until_complete(iterator.__anext__())
            except StopAsyncIteration:
                break
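
Not part of the diff: a brief async consumption sketch for stream_tts, again assuming the module path used by the test above.

import asyncio

from fastrtc.text_to_speech.tts import KokoroTTSOptions, get_tts_model


async def main():
    model = get_tts_model()
    options = KokoroTTSOptions()  # defaults: af_heart voice, speed 1.0, en-us
    # Each item is (sample_rate, float32 samples); a short silence is emitted
    # before the first chunk of every sentence after the first.
    async for sample_rate, samples in model.stream_tts(
        "First sentence. Second sentence!", options
    ):
        print(sample_rate, samples.shape)


if __name__ == "__main__":
    asyncio.run(main())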