mirror of
https://github.com/HumanAIGC-Engineering/gradio-webrtc.git
synced 2026-02-04 09:29:23 +08:00
Added Script to use Ollama and run locally (#111)
* Uses Ollama / Qwen 2.5 to run voice activity locally * git push origin test * format --------- Co-authored-by: easytop <jgwoodward@ymail.com> Co-authored-by: Freddy Boulton <freddyboulton@hf-freddy.local>
This commit is contained in:
@@ -1,15 +1,15 @@
|
||||
from pathlib import Path
|
||||
from typing import List, Dict
|
||||
from typing import Dict, List
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from fastrtc import (
|
||||
ReplyOnPause,
|
||||
Stream,
|
||||
get_stt_model,
|
||||
get_tts_model,
|
||||
Stream,
|
||||
ReplyOnPause,
|
||||
get_twilio_turn_credentials,
|
||||
)
|
||||
from smolagents import CodeAgent, HfApiModel, DuckDuckGoSearchTool
|
||||
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv()
|
||||
|
||||
126
demo/voice_text_editor_local/app.py
Normal file
126
demo/voice_text_editor_local/app.py
Normal file
@@ -0,0 +1,126 @@
|
||||
import os
|
||||
|
||||
import gradio as gr
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
from fastrtc import AdditionalOutputs, ReplyOnPause, Stream, get_stt_model
|
||||
|
||||
# Load environment variables (e.g. MODE) from a local .env file before
# anything reads them.
load_dotenv()

# Initialize the speech-to-text model once at module import so every
# request reuses the same loaded model.
stt_model = get_stt_model()
# System prompt for the local Ollama model: instructs it to act as a
# voice-driven text editor and to return ONLY the new document state.
# `edit` prepends this verbatim to every generation request.
SYSTEM_PROMPT = """You are an intelligent voice-activated text editor assistant. Your purpose is to help users create and modify text documents through voice commands.

For each interaction:
1. You will receive the current state of a text document and a voice input from the user.
2. Determine if the input is:
   a) A command to modify the document (e.g., "delete the last line", "capitalize that")
   b) Content to be added to the document (e.g., "buy 12 eggs at the store")
   c) A modification to existing content (e.g., "actually make that 24" to change "12" to "24")
3. Return ONLY the new document state after the changes have been applied.

Example:

CURRENT DOCUMENT:

Meeting notes:
- Buy GPUs
- Meet with Joe

USER INPUT: Make that 100 GPUS

NEW DOCUMENT STATE:

Meeting notes:
- Buy 100 GPUs
- Meet with Joe

Example 2:

CURRENT DOCUMENT:

Project Proposal

USER INPUT: Make that a header

NEW DOCUMENT STATE:

# Project Proposal

When handling commands:
- Apply the requested changes precisely to the document
- Support operations like adding, deleting, modifying, and moving text
- Understand contextual references like "that", "the last line", "the second paragraph"

When handling content additions:
- Add the new text at the appropriate location (usually at the end or cursor position)
- Format it appropriately based on the document context
- If the user says to "add" or "insert" do not remove text that was already in the document.

When handling content modifications:
- Identify what part of the document the user is referring to
- Apply the requested change while preserving the rest of the content
- Be smart about contextual references (e.g., "make that 24" should know to replace a number)

NEVER include any text in the new document state that is not part of the user's input.
NEVER include the phrase "CURRENT DOCUMENT" in the new document state.
NEVER reword the user's input unless you are explicitly asked to do so.
"""
|
||||
|
||||
def edit(audio, current_document: str):
    """Transcribe a voice command and apply it to the document via local Ollama.

    Args:
        audio: Audio payload from fastrtc, passed straight to the STT model.
        current_document: The document text as it currently stands.

    Yields:
        AdditionalOutputs wrapping the new document state, or an error
        message if the Ollama server cannot be reached.
    """
    prompt = stt_model.stt(audio)
    print(f"Prompt: {prompt}")

    # Single-turn prompt: system instructions, current document, the
    # transcribed user request, then the assistant cue.
    full_prompt = (
        f"{SYSTEM_PROMPT}\n\n"
        f"User: CURRENT DOCUMENT:\n\n{current_document}\n\nUSER INPUT: {prompt}\n\n"
        f"Assistant:"
    )

    try:
        # Send request to ollama's API. Generation parameters must be
        # nested under "options": a top-level "max_tokens" is not a
        # recognized /api/generate field — Ollama's equivalent knob is
        # "num_predict".
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                "model": "qwen2.5",
                "prompt": full_prompt,
                "stream": False,
                "options": {"num_predict": 200},
            },
            # requests has no default timeout; don't hang the handler
            # forever if the local server stalls.
            timeout=120,
        )
        response.raise_for_status()  # Raise an exception for bad status codes

        # Parse the response and drop a leading "Assistant:" echo if the
        # model produced one. Note: lstrip("Assistant:") would be wrong
        # here — it removes any of those *characters* from the left,
        # mangling documents that start with e.g. "A" or "t".
        doc = response.json()["response"]
        doc = doc.strip().removeprefix("Assistant:").strip()
        yield AdditionalOutputs(doc)

    except requests.RequestException as e:
        # Handle API errors (connection refused, timeout, bad status)
        # gracefully instead of crashing the stream.
        error_message = "Error: Could not connect to ollama. Please ensure it's running and qwen2.5 is loaded."
        print(f"API Error: {e}")
        yield AdditionalOutputs(error_message)
|
||||
|
||||
# Shared textbox: passed to `edit` as the current document and updated
# with each new document state the handler yields back.
doc = gr.Textbox(value="", label="Current Document")

stream = Stream(
    ReplyOnPause(edit),  # run `edit` each time the speaker pauses
    modality="audio",
    mode="send",  # NOTE(review): presumably client->server audio only — confirm against fastrtc docs
    additional_inputs=[doc],
    additional_outputs=[doc],
    # Keep only the most recent document state from the handler.
    additional_outputs_handler=lambda prev, current: current,
    ui_args={"title": "Voice Text Editor with FastRTC 🗣️"},
)
|
||||
if __name__ == "__main__":
    # MODE=PHONE serves via a phone number through fastphone; any other
    # value (including "UI" or unset) launches the Gradio UI. The
    # original "UI" branch was byte-identical to the else branch, so the
    # two are merged into a single default.
    if os.getenv("MODE") == "PHONE":
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        stream.ui.launch(server_port=7860)
Reference in New Issue
Block a user