stt models (#147)

2026-02-05 18:09:23 +08:00 · 2025-03-07 17:03:11 -05:00
parent cbbfa17679
commit 504eb452f0
6 changed files with 55 additions and 64 deletions
--- a/backend/fastrtc/speech_to_text/__init__.py
+++ b/backend/fastrtc/speech_to_text/__init__.py
@@ -1,3 +1,3 @@
-from .stt_ import MoonshineSTT, get_stt_model
+from .stt_ import MoonshineSTT, get_stt_model, stt_for_chunks

-__all__ = ["get_stt_model", "MoonshineSTT", "get_stt_model"]
+__all__ = ["get_stt_model", "MoonshineSTT", "stt_for_chunks"]
--- a/backend/fastrtc/speech_to_text/stt_.py
+++ b/backend/fastrtc/speech_to_text/stt_.py
@@ -15,12 +15,6 @@ curr_dir = Path(__file__).parent
 class STTModel(Protocol):
    def stt(self, audio: tuple[int, NDArray[np.int16 | np.float32]]) -> str: ...

-    def stt_for_chunks(
-        self,
-        audio: tuple[int, NDArray[np.int16 | np.float32]],
-        chunks: list[AudioChunk],
-    ) -> str: ...
-

 class MoonshineSTT(STTModel):
    def __init__(
@@ -49,19 +43,6 @@ class MoonshineSTT(STTModel):
        tokens = self.model.generate(audio_np)
        return self.tokenizer.decode_batch(tokens)[0]

-    def stt_for_chunks(
-        self,
-        audio: tuple[int, NDArray[np.int16 | np.float32]],
-        chunks: list[AudioChunk],
-    ) -> str:
-        sr, audio_np = audio
-        return " ".join(
-            [
-                self.stt((sr, audio_np[chunk["start"] : chunk["end"]]))
-                for chunk in chunks
-            ]
-        )
-

@lru_cache
 def get_stt_model(
@@ -79,3 +60,17 @@ def get_stt_model(
    m.stt((16000, audio))
    print(click.style("INFO", fg="green") + ":\t  STT model warmed up.")
    return m
+
+
+def stt_for_chunks(
+    stt_model: STTModel,
+    audio: tuple[int, NDArray[np.int16 | np.float32]],
+    chunks: list[AudioChunk],
+) -> str:
+    """Transcribe each chunk's slice of ``audio`` and join the texts with spaces."""
+    rate, samples = audio
+    pieces = (
+        stt_model.stt((rate, samples[span["start"] : span["end"]]))
+        for span in chunks
+    )
+    return " ".join(pieces)