diff --git a/demo/talk_to_smolagents/app.py b/demo/talk_to_smolagents/app.py
index 598351f..0638c7b 100644
--- a/demo/talk_to_smolagents/app.py
+++ b/demo/talk_to_smolagents/app.py
@@ -1,15 +1,15 @@
 from pathlib import Path
-from typing import List, Dict
+from typing import Dict, List
 
 from dotenv import load_dotenv
 from fastrtc import (
+    ReplyOnPause,
+    Stream,
     get_stt_model,
     get_tts_model,
-    Stream,
-    ReplyOnPause,
     get_twilio_turn_credentials,
 )
-from smolagents import CodeAgent, HfApiModel, DuckDuckGoSearchTool
+from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel
 
 # Load environment variables
 load_dotenv()
diff --git a/demo/voice_text_editor_local/app.py b/demo/voice_text_editor_local/app.py
new file mode 100644
index 0000000..1e6e61f
--- /dev/null
+++ b/demo/voice_text_editor_local/app.py
@@ -0,0 +1,126 @@
+import os
+
+import gradio as gr
+import requests
+from dotenv import load_dotenv
+from fastrtc import AdditionalOutputs, ReplyOnPause, Stream, get_stt_model
+
+load_dotenv()
+
+stt_model = get_stt_model()
+
+SYSTEM_PROMPT = """You are an intelligent voice-activated text editor assistant. Your purpose is to help users create and modify text documents through voice commands.
+
+For each interaction:
+1. You will receive the current state of a text document and a voice input from the user.
+2. Determine if the input is:
+   a) A command to modify the document (e.g., "delete the last line", "capitalize that")
+   b) Content to be added to the document (e.g., "buy 12 eggs at the store")
+   c) A modification to existing content (e.g., "actually make that 24" to change "12" to "24")
+3. Return ONLY the new document state after the changes have been applied.
+
+Example:
+
+CURRENT DOCUMENT:
+
+Meeting notes:
+- Buy GPUs
+- Meet with Joe
+
+USER INPUT: Make that 100 GPUS
+
+NEW DOCUMENT STATE:
+
+Meeting notes:
+- Buy 100 GPUs
+- Meet with Joe
+
+Example 2:
+
+CURRENT DOCUMENT:
+
+Project Proposal
+
+USER INPUT: Make that a header
+
+NEW DOCUMENT STATE:
+
+# Project Proposal
+
+When handling commands:
+- Apply the requested changes precisely to the document
+- Support operations like adding, deleting, modifying, and moving text
+- Understand contextual references like "that", "the last line", "the second paragraph"
+
+When handling content additions:
+- Add the new text at the appropriate location (usually at the end or cursor position)
+- Format it appropriately based on the document context
+- If the user says to "add" or "insert" do not remove text that was already in the document.
+
+When handling content modifications:
+- Identify what part of the document the user is referring to
+- Apply the requested change while preserving the rest of the content
+- Be smart about contextual references (e.g., "make that 24" should know to replace a number)
+
+NEVER include any text in the new document state that is not part of the user's input.
+NEVER include the phrase "CURRENT DOCUMENT" in the new document state.
+NEVER reword the user's input unless you are explicitly asked to do so.
+""" + + +def edit(audio, current_document: str): + prompt = stt_model.stt(audio) + print(f"Prompt: {prompt}") + + # Construct the prompt for ollama + full_prompt = ( + f"{SYSTEM_PROMPT}\n\n" + f"User: CURRENT DOCUMENT:\n\n{current_document}\n\nUSER INPUT: {prompt}\n\n" + f"Assistant:" + ) + + try: + # Send request to ollama's API + response = requests.post( + "http://localhost:11434/api/generate", + json={ + "model": "qwen2.5", + "prompt": full_prompt, + "stream": False, + "max_tokens": 200, + }, + ) + response.raise_for_status() # Raise an exception for bad status codes + + # Parse the response + doc = response.json()["response"] + # Clean up the response to remove "Assistant:" and any extra whitespace + doc = doc.strip().lstrip("Assistant:").strip() + yield AdditionalOutputs(doc) + + except requests.RequestException as e: + # Handle API errors gracefully + error_message = "Error: Could not connect to ollama. Please ensure it's running and qwen2.5 is loaded." + print(f"API Error: {e}") + yield AdditionalOutputs(error_message) + + +doc = gr.Textbox(value="", label="Current Document") + +stream = Stream( + ReplyOnPause(edit), + modality="audio", + mode="send", + additional_inputs=[doc], + additional_outputs=[doc], + additional_outputs_handler=lambda prev, current: current, + ui_args={"title": "Voice Text Editor with FastRTC 🗣️"}, +) + +if __name__ == "__main__": + if (mode := os.getenv("MODE")) == "UI": + stream.ui.launch(server_port=7860) + elif mode == "PHONE": + stream.fastphone(host="0.0.0.0", port=7860) + else: + stream.ui.launch(server_port=7860)