Added Script to use Ollama and run locally (#111)

* Uses Ollama / Qwen 2.5 to run the voice-activated text editor locally

* git push origin test

* format

---------

Co-authored-by: easytop <jgwoodward@ymail.com>
Co-authored-by: Freddy Boulton <freddyboulton@hf-freddy.local>
This commit is contained in:
EasyTop
2025-03-02 15:10:31 -08:00
committed by GitHub
parent e6193b0d05
commit 25c2152304
2 changed files with 130 additions and 4 deletions

View File

@@ -1,15 +1,15 @@
from pathlib import Path
from typing import List, Dict
from typing import Dict, List
from dotenv import load_dotenv
from fastrtc import (
ReplyOnPause,
Stream,
get_stt_model,
get_tts_model,
Stream,
ReplyOnPause,
get_twilio_turn_credentials,
)
from smolagents import CodeAgent, HfApiModel, DuckDuckGoSearchTool
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel
# Load environment variables
load_dotenv()

View File

@@ -0,0 +1,126 @@
import os
import gradio as gr
import requests
from dotenv import load_dotenv
from fastrtc import AdditionalOutputs, ReplyOnPause, Stream, get_stt_model
load_dotenv()
stt_model = get_stt_model()
SYSTEM_PROMPT = """You are an intelligent voice-activated text editor assistant. Your purpose is to help users create and modify text documents through voice commands.
For each interaction:
1. You will receive the current state of a text document and a voice input from the user.
2. Determine if the input is:
a) A command to modify the document (e.g., "delete the last line", "capitalize that")
b) Content to be added to the document (e.g., "buy 12 eggs at the store")
c) A modification to existing content (e.g., "actually make that 24" to change "12" to "24")
3. Return ONLY the new document state after the changes have been applied.
Example:
CURRENT DOCUMENT:
Meeting notes:
- Buy GPUs
- Meet with Joe
USER INPUT: Make that 100 GPUS
NEW DOCUMENT STATE:
Meeting notes:
- Buy 100 GPUs
- Meet with Joe
Example 2:
CURRENT DOCUMENT:
Project Proposal
USER INPUT: Make that a header
NEW DOCUMENT STATE:
# Project Proposal
When handling commands:
- Apply the requested changes precisely to the document
- Support operations like adding, deleting, modifying, and moving text
- Understand contextual references like "that", "the last line", "the second paragraph"
When handling content additions:
- Add the new text at the appropriate location (usually at the end or cursor position)
- Format it appropriately based on the document context
- If the user says to "add" or "insert" do not remove text that was already in the document.
When handling content modifications:
- Identify what part of the document the user is referring to
- Apply the requested change while preserving the rest of the content
- Be smart about contextual references (e.g., "make that 24" should know to replace a number)
NEVER include any text in the new document state that is not part of the user's input.
NEVER include the phrase "CURRENT DOCUMENT" in the new document state.
NEVER reword the user's input unless you are explicitly asked to do so.
"""
def edit(audio, current_document: str):
prompt = stt_model.stt(audio)
print(f"Prompt: {prompt}")
# Construct the prompt for ollama
full_prompt = (
f"{SYSTEM_PROMPT}\n\n"
f"User: CURRENT DOCUMENT:\n\n{current_document}\n\nUSER INPUT: {prompt}\n\n"
f"Assistant:"
)
try:
# Send request to ollama's API
response = requests.post(
"http://localhost:11434/api/generate",
json={
"model": "qwen2.5",
"prompt": full_prompt,
"stream": False,
"max_tokens": 200,
},
)
response.raise_for_status() # Raise an exception for bad status codes
# Parse the response
doc = response.json()["response"]
# Clean up the response to remove "Assistant:" and any extra whitespace
doc = doc.strip().lstrip("Assistant:").strip()
yield AdditionalOutputs(doc)
except requests.RequestException as e:
# Handle API errors gracefully
error_message = "Error: Could not connect to ollama. Please ensure it's running and qwen2.5 is loaded."
print(f"API Error: {e}")
yield AdditionalOutputs(error_message)
doc = gr.Textbox(value="", label="Current Document")
stream = Stream(
ReplyOnPause(edit),
modality="audio",
mode="send",
additional_inputs=[doc],
additional_outputs=[doc],
additional_outputs_handler=lambda prev, current: current,
ui_args={"title": "Voice Text Editor with FastRTC 🗣️"},
)
if __name__ == "__main__":
if (mode := os.getenv("MODE")) == "UI":
stream.ui.launch(server_port=7860)
elif mode == "PHONE":
stream.fastphone(host="0.0.0.0", port=7860)
else:
stream.ui.launch(server_port=7860)