diff --git a/demo/talk_to_smolagents/README.md b/demo/talk_to_smolagents/README.md
new file mode 100644
index 0000000..8a3b806
--- /dev/null
+++ b/demo/talk_to_smolagents/README.md
@@ -0,0 +1,98 @@
+---
+title: Talk to Smolagents
+emoji: 💻
+colorFrom: purple
+colorTo: red
+sdk: gradio
+sdk_version: 5.16.0
+app_file: app.py
+pinned: false
+license: mit
+short_description: FastRTC Voice Agent with smolagents
+tags: [webrtc, websocket, gradio, secret|HF_TOKEN]
+---
+
+# Voice LLM Agent with Image Generation
+
+A voice-enabled AI assistant powered by FastRTC that can:
+1. Stream audio in real-time using WebRTC
+2. Listen and respond with natural pauses in conversation
+3. Generate images based on your requests
+4. Maintain conversation context across exchanges
+
+This app combines the real-time communication capabilities of FastRTC with the powerful agent framework of smolagents.
+
+## Key Features
+
+- **Real-time Streaming**: Uses FastRTC's WebRTC-based audio streaming
+- **Voice Activation**: Automatic detection of speech pauses to trigger responses
+- **Multi-modal Interaction**: Combines voice and image generation in a single interface
+
+## Setup
+
+1. Install Python 3.9+ and create a virtual environment:
+   ```bash
+   python -m venv .venv
+   source .venv/bin/activate  # On Windows: .venv\Scripts\activate
+   ```
+
+2. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Create a `.env` file with the following:
+   ```
+   HF_TOKEN=your_huggingface_api_key
+   MODE=UI  # Use 'UI' for the Gradio interface, leave blank for the HTML interface
+   ```
+
+## Running the App
+
+### With Gradio UI (Recommended)
+
+```bash
+MODE=UI python app.py
+```
+
+This launches a Gradio UI at http://localhost:7860 with:
+- FastRTC's built-in streaming audio components
+- A chat interface showing the conversation
+- An image display panel for generated images
+
+## How to Use
+
+1. Click the microphone button to start streaming your voice.
+2. Speak naturally - the app will automatically detect when you pause.
+3. Ask the agent to generate an image, for example:
+   - "Create an image of a magical forest with glowing mushrooms."
+   - "Generate a picture of a futuristic city with flying cars."
+4. View the generated image and hear the agent's response.
+
+## Technical Architecture
+
+### FastRTC Components
+
+- **Stream**: Core component that handles WebRTC connections and audio streaming
+- **ReplyOnPause**: Detects when the user stops speaking to trigger a response
+- **get_stt_model/get_tts_model**: Provide optimized speech-to-text and text-to-speech models
+
+### smolagents Components
+
+- **CodeAgent**: Intelligent agent that can use tools based on natural language inputs
+- **Tool.from_space**: Integration with Hugging Face Spaces for image generation
+- **HfApiModel**: Connection to powerful language models for understanding requests
+
+### Integration Flow
+
+1. FastRTC streams and processes audio input in real-time
+2. Speech is converted to text and passed to the smolagents CodeAgent
+3. The agent processes the request and calls tools when needed
+4. Responses and generated images are streamed back through FastRTC
+5. The UI updates to show both text responses and generated images
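+
+A minimal sketch of this loop, condensed from this demo's `app.py` (the search tool, default model, and omitted options such as TURN credentials, sample rate, and UI styling are simplifications, not the full configuration):
+
+```python
+from fastrtc import ReplyOnPause, Stream, get_stt_model, get_tts_model
+from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel
+
+stt_model = get_stt_model()  # speech-to-text
+tts_model = get_tts_model()  # text-to-speech
+agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=HfApiModel())
+
+
+def talk(audio):
+    text = stt_model.stt(audio)  # steps 1-2: audio in, text out
+    reply = agent.run(text)  # step 3: the agent reasons and calls tools
+    for chunk in tts_model.stream_tts_sync(reply or ""):
+        yield chunk  # steps 4-5: audio chunks stream back over WebRTC
+
+
+stream = Stream(handler=ReplyOnPause(talk), modality="audio", mode="send-receive")
+stream.ui.launch()
+```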
+
+## Advanced Features
+
+- Conversation history is maintained across exchanges
+- Error handling ensures the app continues working even if agent processing fails
+- The application leverages FastRTC's streaming capabilities for efficient audio transmission
\ No newline at end of file
diff --git a/demo/talk_to_smolagents/app.py b/demo/talk_to_smolagents/app.py
new file mode 100644
index 0000000..598351f
--- /dev/null
+++ b/demo/talk_to_smolagents/app.py
@@ -0,0 +1,99 @@
+from pathlib import Path
+from typing import List, Dict
+
+from dotenv import load_dotenv
+from fastrtc import (
+    get_stt_model,
+    get_tts_model,
+    Stream,
+    ReplyOnPause,
+    get_twilio_turn_credentials,
+)
+from smolagents import CodeAgent, HfApiModel, DuckDuckGoSearchTool
+
+# Load environment variables
+load_dotenv()
+
+# Initialize file paths
+curr_dir = Path(__file__).parent
+
+# Initialize models
+stt_model = get_stt_model()
+tts_model = get_tts_model()
+
+# Conversation state to maintain history
+conversation_state: List[Dict[str, str]] = []
+
+# System prompt for agent
+system_prompt = """You are a helpful assistant that helps with finding places to
+work remotely from. You should specifically check the reviews and ratings of the
+place. Use these criteria to find the best place to work from:
+- Price
+- Reviews
+- Ratings
+- Location
+- WIFI
+Only return the name and address of the place, plus a short description.
+Always search for real places.
+Only return real places, not fake ones.
+If you receive anything other than a location, you should ask for a location.
+
+User: I am in Paris, France. Can you find me a place to work from?
+Assistant: I found a place called "Le Café de la Paix" at 123 Rue de la Paix,
+Paris, France. It has good reviews and is in a great location.
+
+
+User: I am in London, UK. Can you find me a place to work from?
+Assistant: I found a place called "The London Coffee Company".
+
+
+User: How many people are in the room?
+Assistant: I only respond to requests about finding places to work from.
+
+
+"""
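+
+# NOTE: the prompt above steers the agent with a few-shot pattern: two worked
+# examples plus an explicit refusal for off-topic requests. Swap these examples
+# out if you repurpose the demo for a different domain.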
+
+model = HfApiModel(provider="together", model="Qwen/Qwen2.5-Coder-32B-Instruct")
+
+agent = CodeAgent(
+    tools=[
+        DuckDuckGoSearchTool(),
+    ],
+    model=model,
+    max_steps=10,
+    verbosity_level=2,
+    description="Search the web for cafes to work from.",
+)
+
+
+def process_response(audio):
+    """Process audio input and generate LLM response with TTS"""
+    # Convert speech to text using STT model
+    text = stt_model.stt(audio)
+    if not text.strip():
+        return
+
+    input_text = f"{system_prompt}\n\n{text}"
+    # Get response from agent
+    response_content = agent.run(input_text)
+
+    # Convert response to audio using TTS model
+    for audio_chunk in tts_model.stream_tts_sync(response_content or ""):
+        # Yield the audio chunk
+        yield audio_chunk
+
+
+stream = Stream(
+    handler=ReplyOnPause(process_response, input_sample_rate=16000),
+    modality="audio",
+    mode="send-receive",
+    ui_args={
+        "pulse_color": "rgb(255, 255, 255)",
+        "icon_button_color": "rgb(255, 255, 255)",
+        "title": "🧑‍💻The Coworking Agent",
+    },
+    # TURN credentials (here fetched from Twilio) help WebRTC connect across restrictive NATs
+    rtc_configuration=get_twilio_turn_credentials(),
+)
+
+if __name__ == "__main__":
+    stream.ui.launch(server_port=7860)
diff --git a/demo/talk_to_smolagents/requirements.txt b/demo/talk_to_smolagents/requirements.txt
new file mode 100644
index 0000000..4293ca6
--- /dev/null
+++ b/demo/talk_to_smolagents/requirements.txt
@@ -0,0 +1,136 @@
+# This file was autogenerated by uv via the following command:
+# uv export --format requirements-txt --no-hashes
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.6
+aiohttp==3.11.13
+aiohttp-retry==2.9.1
+aioice==0.9.0
+aiortc==1.10.1
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.8.0
+async-timeout==5.0.1 ; python_full_version < '3.11'
+attrs==25.1.0
+audioop-lts==0.2.1 ; python_full_version >= '3.13'
+audioread==3.0.1
+av==13.1.0
+babel==2.17.0
+beautifulsoup4==4.13.3
+certifi==2025.1.31
+cffi==1.17.1
+charset-normalizer==3.4.1
+click==8.1.8
+colorama==0.4.6
+coloredlogs==15.0.1
+colorlog==6.9.0
+cryptography==44.0.1
+csvw==3.5.1
+decorator==5.2.1
+dlinfo==2.0.0
+dnspython==2.7.0
+duckduckgo-search==7.5.0
+espeakng-loader==0.2.4
+exceptiongroup==1.2.2 ; python_full_version < '3.11'
+fastapi==0.115.8
+fastrtc==0.0.8.post1
+fastrtc-moonshine-onnx==20241016
+ffmpy==0.5.0
+filelock==3.17.0
+flatbuffers==25.2.10
+frozenlist==1.5.0
+fsspec==2025.2.0
+google-crc32c==1.6.0
+gradio==5.19.0
+gradio-client==1.7.2
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+huggingface-hub==0.29.1
+humanfriendly==10.0
+idna==3.10
+ifaddr==0.2.0
+isodate==0.7.2
+jinja2==3.1.5
+joblib==1.4.2
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+kokoro-onnx==0.4.3
+language-tags==1.2.0
+lazy-loader==0.4
+librosa==0.10.2.post1
+llvmlite==0.44.0
+lxml==5.3.1
+markdown-it-py==3.0.0
+markdownify==1.0.0
+markupsafe==2.1.5
+mdurl==0.1.2
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.1.0
+numba==0.61.0
+numpy==2.1.3
+onnxruntime==1.20.1
+orjson==3.10.15
+packaging==24.2
+pandas==2.2.3
+phonemizer-fork==3.3.1
+pillow==11.1.0
+platformdirs==4.3.6
+pooch==1.8.2
+primp==0.14.0
+propcache==0.3.0
+protobuf==5.29.3
+pycparser==2.22
+pydantic==2.10.6
+pydantic-core==2.27.2
+pydub==0.25.1
+pyee==12.1.1
+pygments==2.19.1
+pyjwt==2.10.1
+pylibsrtp==0.11.0
+pyopenssl==25.0.0
+pyparsing==3.2.1
+pyreadline3==3.5.4 ; sys_platform == 'win32'
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-multipart==0.0.20
+pytz==2025.1
+pyyaml==6.0.2
+rdflib==7.1.3
+referencing==0.36.2
+regex==2024.11.6
+requests==2.32.3
+rfc3986==1.5.0
+rich==13.9.4
+rpds-py==0.23.1
+ruff==0.9.7 ; sys_platform != 'emscripten'
+safehttpx==0.1.6
+scikit-learn==1.6.1
+scipy==1.15.2
+segments==2.3.0
+semantic-version==2.10.0
+shellingham==1.5.4 ; sys_platform != 'emscripten'
+six==1.17.0
+smolagents==1.9.2
+sniffio==1.3.1
+soundfile==0.13.1
+soupsieve==2.6
+soxr==0.5.0.post1
+standard-aifc==3.13.0 ; python_full_version >= '3.13'
+standard-chunk==3.13.0 ; python_full_version >= '3.13'
+standard-sunau==3.13.0 ; python_full_version >= '3.13'
+starlette==0.45.3
+sympy==1.13.3
+threadpoolctl==3.5.0
+tokenizers==0.21.0
+tomlkit==0.13.2
+tqdm==4.67.1
+twilio==9.4.6
+typer==0.15.1 ; sys_platform != 'emscripten'
+typing-extensions==4.12.2
+tzdata==2025.1
+uritemplate==4.1.1
+urllib3==2.3.0
+uvicorn==0.34.0 ; sys_platform != 'emscripten'
+websockets==15.0
+yarl==1.18.3
diff --git a/docs/cookbook.md b/docs/cookbook.md
index b510f48..64d5d59 100644
--- a/docs/cookbook.md
+++ b/docs/cookbook.md
@@ -33,6 +33,7 @@ A collection of applications built with FastRTC. Click on the tags below to find
+