mirror of
https://github.com/HumanAIGC-Engineering/gradio-webrtc.git
synced 2026-02-05 18:09:23 +08:00
sync code of fastrtc, add text support through datachannel, fix safari connect problem support chat without camera or mic
100 lines
2.8 KiB
Python
100 lines
2.8 KiB
Python
from pathlib import Path
|
||
from typing import Dict, List
|
||
|
||
from dotenv import load_dotenv
|
||
from fastrtc import (
|
||
ReplyOnPause,
|
||
Stream,
|
||
get_stt_model,
|
||
get_tts_model,
|
||
get_twilio_turn_credentials,
|
||
)
|
||
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel
|
||
|
||
# Load environment variables
|
||
load_dotenv()
|
||
|
||
# Initialize file paths
|
||
curr_dir = Path(__file__).parent
|
||
|
||
# Initialize models
|
||
stt_model = get_stt_model()
|
||
tts_model = get_tts_model()
|
||
|
||
# Conversation state to maintain history
|
||
conversation_state: List[Dict[str, str]] = []
|
||
|
||
# System prompt for agent
|
||
system_prompt = """You are a helpful assistant that can helps with finding places to
|
||
workremotely from. You should specifically check against reviews and ratings of the
|
||
place. You should use this criteria to find the best place to work from:
|
||
- Price
|
||
- Reviews
|
||
- Ratings
|
||
- Location
|
||
- WIFI
|
||
Only return the name, address of the place, and a short description of the place.
|
||
Always search for real places.
|
||
Only return real places, not fake ones.
|
||
If you receive anything other than a location, you should ask for a location.
|
||
<example>
|
||
User: I am in Paris, France. Can you find me a place to work from?
|
||
Assistant: I found a place called "Le Café de la Paix" at 123 Rue de la Paix,
|
||
Paris, France. It has good reviews and is in a great location.
|
||
</example>
|
||
<example>
|
||
User: I am in London, UK. Can you find me a place to work from?
|
||
Assistant: I found a place called "The London Coffee Company".
|
||
</example>
|
||
<example>
|
||
User: How many people are in the room?
|
||
Assistant: I only respond to requests about finding places to work from.
|
||
</example>
|
||
|
||
"""
|
||
|
||
model = HfApiModel(provider="together", model="Qwen/Qwen2.5-Coder-32B-Instruct")
|
||
|
||
agent = CodeAgent(
|
||
tools=[
|
||
DuckDuckGoSearchTool(),
|
||
],
|
||
model=model,
|
||
max_steps=10,
|
||
verbosity_level=2,
|
||
description="Search the web for cafes to work from.",
|
||
)
|
||
|
||
|
||
def process_response(audio):
|
||
"""Process audio input and generate LLM response with TTS"""
|
||
# Convert speech to text using STT model
|
||
text = stt_model.stt(audio)
|
||
if not text.strip():
|
||
return
|
||
|
||
input_text = f"{system_prompt}\n\n{text}"
|
||
# Get response from agent
|
||
response_content = agent.run(input_text)
|
||
|
||
# Convert response to audio using TTS model
|
||
for audio_chunk in tts_model.stream_tts_sync(response_content or ""):
|
||
# Yield the audio chunk
|
||
yield audio_chunk
|
||
|
||
|
||
stream = Stream(
|
||
handler=ReplyOnPause(process_response, input_sample_rate=16000),
|
||
modality="audio",
|
||
mode="send-receive",
|
||
ui_args={
|
||
"pulse_color": "rgb(255, 255, 255)",
|
||
"icon_button_color": "rgb(255, 255, 255)",
|
||
"title": "🧑💻The Coworking Agent",
|
||
},
|
||
rtc_configuration=get_twilio_turn_credentials(),
|
||
)
|
||
|
||
if __name__ == "__main__":
|
||
stream.ui.launch(server_port=7860)
|