mirror of
https://github.com/HumanAIGC-Engineering/gradio-webrtc.git
synced 2026-02-04 09:29:23 +08:00
Added Script to use Ollama and run locally (#111)
* Uses Ollama / Qwen 2.5 to run voice activity locally * git push origin test * format --------- Co-authored-by: easytop <jgwoodward@ymail.com> Co-authored-by: Freddy Boulton <freddyboulton@hf-freddy.local>
This commit is contained in:
@@ -1,15 +1,15 @@
|
||||
from pathlib import Path
|
||||
from typing import List, Dict
|
||||
from typing import Dict, List
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from fastrtc import (
|
||||
ReplyOnPause,
|
||||
Stream,
|
||||
get_stt_model,
|
||||
get_tts_model,
|
||||
Stream,
|
||||
ReplyOnPause,
|
||||
get_twilio_turn_credentials,
|
||||
)
|
||||
from smolagents import CodeAgent, HfApiModel, DuckDuckGoSearchTool
|
||||
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv()
|
||||
|
||||
126
demo/voice_text_editor_local/app.py
Normal file
126
demo/voice_text_editor_local/app.py
Normal file
@@ -0,0 +1,126 @@
|
||||
import os
|
||||
|
||||
import gradio as gr
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
from fastrtc import AdditionalOutputs, ReplyOnPause, Stream, get_stt_model
|
||||
|
||||
# Load environment variables (e.g. MODE) from a local .env file before
# anything reads them.
load_dotenv()

# Initialize the speech-to-text model once at module import so every
# request reuses the same loaded model.
stt_model = get_stt_model()
# System prompt for the local Ollama model: instructs it to act as a
# voice-driven text editor and to return ONLY the new document state.
# `edit` prepends this verbatim to every generation request.
SYSTEM_PROMPT = """You are an intelligent voice-activated text editor assistant. Your purpose is to help users create and modify text documents through voice commands.

For each interaction:
1. You will receive the current state of a text document and a voice input from the user.
2. Determine if the input is:
   a) A command to modify the document (e.g., "delete the last line", "capitalize that")
   b) Content to be added to the document (e.g., "buy 12 eggs at the store")
   c) A modification to existing content (e.g., "actually make that 24" to change "12" to "24")
3. Return ONLY the new document state after the changes have been applied.

Example:

CURRENT DOCUMENT:

Meeting notes:
- Buy GPUs
- Meet with Joe

USER INPUT: Make that 100 GPUS

NEW DOCUMENT STATE:

Meeting notes:
- Buy 100 GPUs
- Meet with Joe

Example 2:

CURRENT DOCUMENT:

Project Proposal

USER INPUT: Make that a header

NEW DOCUMENT STATE:

# Project Proposal

When handling commands:
- Apply the requested changes precisely to the document
- Support operations like adding, deleting, modifying, and moving text
- Understand contextual references like "that", "the last line", "the second paragraph"

When handling content additions:
- Add the new text at the appropriate location (usually at the end or cursor position)
- Format it appropriately based on the document context
- If the user says to "add" or "insert" do not remove text that was already in the document.

When handling content modifications:
- Identify what part of the document the user is referring to
- Apply the requested change while preserving the rest of the content
- Be smart about contextual references (e.g., "make that 24" should know to replace a number)

NEVER include any text in the new document state that is not part of the user's input.
NEVER include the phrase "CURRENT DOCUMENT" in the new document state.
NEVER reword the user's input unless you are explicitly asked to do so.
"""
|
||||
|
||||
def edit(audio, current_document: str):
    """Transcribe a voice command and apply it to the document via local Ollama.

    Args:
        audio: Audio payload from fastrtc, passed straight to the STT model.
        current_document: The document text as it currently stands.

    Yields:
        AdditionalOutputs wrapping the new document state, or an error
        message if the Ollama server cannot be reached.
    """
    prompt = stt_model.stt(audio)
    print(f"Prompt: {prompt}")

    # Single-turn prompt: system instructions, current document, the
    # transcribed user request, then the assistant cue.
    full_prompt = (
        f"{SYSTEM_PROMPT}\n\n"
        f"User: CURRENT DOCUMENT:\n\n{current_document}\n\nUSER INPUT: {prompt}\n\n"
        f"Assistant:"
    )

    try:
        # Send request to ollama's API. Generation parameters must be
        # nested under "options": a top-level "max_tokens" is not a
        # recognized /api/generate field — Ollama's equivalent knob is
        # "num_predict".
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                "model": "qwen2.5",
                "prompt": full_prompt,
                "stream": False,
                "options": {"num_predict": 200},
            },
            # requests has no default timeout; don't hang the handler
            # forever if the local server stalls.
            timeout=120,
        )
        response.raise_for_status()  # Raise an exception for bad status codes

        # Parse the response and drop a leading "Assistant:" echo if the
        # model produced one. Note: lstrip("Assistant:") would be wrong
        # here — it removes any of those *characters* from the left,
        # mangling documents that start with e.g. "A" or "t".
        doc = response.json()["response"]
        doc = doc.strip().removeprefix("Assistant:").strip()
        yield AdditionalOutputs(doc)

    except requests.RequestException as e:
        # Handle API errors (connection refused, timeout, bad status)
        # gracefully instead of crashing the stream.
        error_message = "Error: Could not connect to ollama. Please ensure it's running and qwen2.5 is loaded."
        print(f"API Error: {e}")
        yield AdditionalOutputs(error_message)
|
||||
|
||||
# Shared textbox: passed to `edit` as the current document and updated
# with each new document state the handler yields back.
doc = gr.Textbox(value="", label="Current Document")

stream = Stream(
    ReplyOnPause(edit),  # run `edit` each time the speaker pauses
    modality="audio",
    mode="send",  # NOTE(review): presumably client->server audio only — confirm against fastrtc docs
    additional_inputs=[doc],
    additional_outputs=[doc],
    # Keep only the most recent document state from the handler.
    additional_outputs_handler=lambda prev, current: current,
    ui_args={"title": "Voice Text Editor with FastRTC 🗣️"},
)
|
||||
if __name__ == "__main__":
    # MODE=PHONE serves via a phone number through fastphone; any other
    # value (including "UI" or unset) launches the Gradio UI. The
    # original "UI" branch was byte-identical to the else branch, so the
    # two are merged into a single default.
    if os.getenv("MODE") == "PHONE":
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        stream.ui.launch(server_port=7860)
Reference in New Issue
Block a user