Spaces:
Running
Running
Amir Mahla
committed on
Commit
·
c9554cf
1
Parent(s):
af1ae43
MOCK backend
Browse files
- cua2-core/pyproject.toml +1 -3
- cua2-core/src/cua2-core/models/models.py +0 -95
- cua2-core/src/cua2-core/services/agent_service.py +0 -130
- cua2-core/src/cua2-core/services/agents/get_agents.py +0 -57
- cua2-core/src/cua2-core/services/agents/normalized_1000_agent.py +0 -293
- cua2-core/src/cua2-core/services/agents/normalized_agent.py +0 -282
- cua2-core/src/cua2-core/services/agents/pixel_coordonates_agent.py +0 -317
- cua2-core/src/cua2-core/services/agents/prompt.py +0 -548
- cua2-core/src/cua2-core/services/models/anthropic.py +0 -10
- cua2-core/src/cua2-core/services/models/gemini.py +0 -0
- cua2-core/src/cua2-core/services/models/get_model.py +0 -12
- cua2-core/src/cua2-core/services/models/qwen.py +0 -0
- cua2-core/src/{cua2-core → cua2_core}/__init__.py +0 -0
- cua2-core/src/{cua2-core → cua2_core}/app.py +2 -2
- cua2-core/src/{cua2-core → cua2_core}/main.py +7 -7
- cua2-core/src/cua2_core/models/__init__.py +2 -0
- cua2-core/src/cua2_core/models/models.py +221 -0
- cua2-core/src/cua2_core/routes/__init__.py +2 -0
- cua2-core/src/{cua2-core → cua2_core}/routes/routes.py +3 -3
- cua2-core/src/{cua2-core → cua2_core}/routes/websocket.py +35 -30
- cua2-core/src/cua2_core/services/__init__.py +2 -0
- cua2-core/src/cua2_core/services/agent_service.py +172 -0
- cua2-core/src/cua2_core/services/simulation_metadata/simulated_trace.json +62 -0
- cua2-core/src/cua2_core/websocket/__init__.py +2 -0
- cua2-core/src/{cua2-core → cua2_core}/websocket/websocket_manager.py +4 -4
cua2-core/pyproject.toml
CHANGED
|
@@ -6,7 +6,6 @@ build-backend = "hatchling.build"
|
|
| 6 |
name = "cua2-core"
|
| 7 |
version = "0.0.0-dev.0"
|
| 8 |
description = "Backend API server for Computer Use Agent"
|
| 9 |
-
readme = "README.md"
|
| 10 |
authors = [{ name = "Amir Mahla", email = "[email protected]" }]
|
| 11 |
keywords = ["fastapi", "api", "backend", "automation"]
|
| 12 |
classifiers = [
|
|
@@ -61,12 +60,11 @@ Homepage = "https://github.com/huggingface/CUA2"
|
|
| 61 |
Repository = "https://github.com/huggingface/CUA2"
|
| 62 |
|
| 63 |
[tool.hatch.build.targets.wheel]
|
| 64 |
-
packages = ["src/
|
| 65 |
|
| 66 |
[tool.hatch.build.targets.sdist]
|
| 67 |
include = [
|
| 68 |
"/src",
|
| 69 |
-
"/README.md",
|
| 70 |
]
|
| 71 |
|
| 72 |
[tool.coverage.run]
|
|
|
|
| 6 |
name = "cua2-core"
|
| 7 |
version = "0.0.0-dev.0"
|
| 8 |
description = "Backend API server for Computer Use Agent"
|
|
|
|
| 9 |
authors = [{ name = "Amir Mahla", email = "[email protected]" }]
|
| 10 |
keywords = ["fastapi", "api", "backend", "automation"]
|
| 11 |
classifiers = [
|
|
|
|
| 60 |
Repository = "https://github.com/huggingface/CUA2"
|
| 61 |
|
| 62 |
[tool.hatch.build.targets.wheel]
|
| 63 |
+
packages = ["src/cua2_core"]
|
| 64 |
|
| 65 |
[tool.hatch.build.targets.sdist]
|
| 66 |
include = [
|
| 67 |
"/src",
|
|
|
|
| 68 |
]
|
| 69 |
|
| 70 |
[tool.coverage.run]
|
cua2-core/src/cua2-core/models/models.py
DELETED
|
@@ -1,95 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import os
|
| 3 |
-
from datetime import datetime
|
| 4 |
-
from enum import Enum
|
| 5 |
-
from typing import Literal, Optional
|
| 6 |
-
|
| 7 |
-
from pydantic import BaseModel, model_validator
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
class AgentMetadata(BaseModel):
|
| 11 |
-
"""Metadata for agent execution"""
|
| 12 |
-
|
| 13 |
-
inputTokensUsed: int
|
| 14 |
-
outputTokensUsed: int
|
| 15 |
-
timeTaken: float # in seconds
|
| 16 |
-
numberOfSteps: int
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
class AgentType(str, Enum):
|
| 20 |
-
"""Agent type"""
|
| 21 |
-
|
| 22 |
-
PIXEL_COORDINATES = "pixel_coordinates"
|
| 23 |
-
NORMALIZED_1000_COORDINATES = "normalized_1000_coordinates"
|
| 24 |
-
NORMALIZED_COORDINATES = "normalized_coordinates"
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
class ActiveTask(BaseModel):
|
| 28 |
-
"""Active task"""
|
| 29 |
-
|
| 30 |
-
message_id: str
|
| 31 |
-
content: str
|
| 32 |
-
model_id: str
|
| 33 |
-
start_time: datetime
|
| 34 |
-
status: str
|
| 35 |
-
|
| 36 |
-
@property
|
| 37 |
-
def trace_path(self):
|
| 38 |
-
"""Trace path"""
|
| 39 |
-
return f"data/trace-{self.message_id}-{self.model_id}"
|
| 40 |
-
|
| 41 |
-
@model_validator(mode="after")
|
| 42 |
-
def validate_model_id(self):
|
| 43 |
-
"""Validate model ID"""
|
| 44 |
-
os.makedirs(self.trace_path, exist_ok=True)
|
| 45 |
-
with open(f"{self.trace_path}/user_tasks.json", "w") as f:
|
| 46 |
-
json.dump(self.model_dump(mode="json"), f, indent=2)
|
| 47 |
-
|
| 48 |
-
return self
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
class WebSocketEvent(BaseModel):
|
| 52 |
-
"""WebSocket event structure"""
|
| 53 |
-
|
| 54 |
-
type: Literal[
|
| 55 |
-
"agent_start",
|
| 56 |
-
"agent_progress",
|
| 57 |
-
"agent_complete",
|
| 58 |
-
"agent_error",
|
| 59 |
-
"vnc_url_set",
|
| 60 |
-
"vnc_url_unset",
|
| 61 |
-
"heartbeat",
|
| 62 |
-
]
|
| 63 |
-
content: Optional[str] = None
|
| 64 |
-
metadata: Optional[AgentMetadata] = None
|
| 65 |
-
messageId: Optional[str] = None
|
| 66 |
-
vncUrl: Optional[str] = None
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
class UserTaskMessage(BaseModel):
|
| 70 |
-
"""Message sent from frontend to backend"""
|
| 71 |
-
|
| 72 |
-
type: Literal["user_task"]
|
| 73 |
-
content: str
|
| 74 |
-
model_id: str
|
| 75 |
-
timestamp: str
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
class AgentMessage(BaseModel):
|
| 79 |
-
"""Agent message structure"""
|
| 80 |
-
|
| 81 |
-
id: str
|
| 82 |
-
type: Literal["user", "agent"]
|
| 83 |
-
content: str
|
| 84 |
-
timestamp: datetime
|
| 85 |
-
metadata: Optional[AgentMetadata] = None
|
| 86 |
-
isLoading: Optional[bool] = None
|
| 87 |
-
truncated: Optional[bool] = None
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
class HealthResponse(BaseModel):
|
| 91 |
-
"""Health check response"""
|
| 92 |
-
|
| 93 |
-
status: str
|
| 94 |
-
timestamp: datetime
|
| 95 |
-
websocket_connections: int
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cua2-core/src/cua2-core/services/agent_service.py
DELETED
|
@@ -1,130 +0,0 @@
|
|
| 1 |
-
import asyncio
|
| 2 |
-
import uuid
|
| 3 |
-
from datetime import datetime
|
| 4 |
-
from typing import Optional
|
| 5 |
-
|
| 6 |
-
from smolagents import Model
|
| 7 |
-
|
| 8 |
-
from backend.models.models import ActiveTask, AgentMetadata
|
| 9 |
-
from backend.services.agents.get_agents import get_agent
|
| 10 |
-
from backend.services.models.get_model import get_model
|
| 11 |
-
from backend.websocket.websocket_manager import WebSocketManager
|
| 12 |
-
from computer_use_studio import Sandbox
|
| 13 |
-
from computer_use_studio.logger import get_logger
|
| 14 |
-
|
| 15 |
-
logger = get_logger(__name__)
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
class AgentService:
|
| 19 |
-
"""Service for handling agent tasks and processing"""
|
| 20 |
-
|
| 21 |
-
def __init__(self, websocket_manager):
|
| 22 |
-
self.active_tasks: dict[str, ActiveTask] = {}
|
| 23 |
-
self.websocket_manager: WebSocketManager = websocket_manager
|
| 24 |
-
|
| 25 |
-
async def process_user_task(self, content: str, model_id: str) -> str:
|
| 26 |
-
"""Process a user task and return the message ID"""
|
| 27 |
-
|
| 28 |
-
message_id = str(uuid.uuid4())
|
| 29 |
-
while message_id in self.active_tasks.keys():
|
| 30 |
-
message_id = str(uuid.uuid4())
|
| 31 |
-
|
| 32 |
-
# Store the task
|
| 33 |
-
self.active_tasks[message_id] = ActiveTask(
|
| 34 |
-
message_id=message_id,
|
| 35 |
-
content=content,
|
| 36 |
-
model_id=model_id,
|
| 37 |
-
start_time=datetime.now(),
|
| 38 |
-
status="processing",
|
| 39 |
-
)
|
| 40 |
-
|
| 41 |
-
# Determine the agent type based on the content of the task (TODO: implement agent type detection using LLM)
|
| 42 |
-
prompt_type = "FORM_SYSTEM_PROMPT"
|
| 43 |
-
|
| 44 |
-
# Start the agent processing in the background
|
| 45 |
-
asyncio.create_task(
|
| 46 |
-
self._simulate_agent_processing(content, model_id, message_id, prompt_type)
|
| 47 |
-
)
|
| 48 |
-
|
| 49 |
-
return message_id
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
# async def _simulate_agent_processing(self, message_id: str, content: str):
|
| 53 |
-
# """Simulate agent processing with progress updates"""
|
| 54 |
-
# try:
|
| 55 |
-
# # Send agent start event
|
| 56 |
-
# await self.websocket_manager.send_agent_start(
|
| 57 |
-
# content=f"Starting task: {content}", message_id=message_id
|
| 58 |
-
# )
|
| 59 |
-
#
|
| 60 |
-
# # Simulate processing steps
|
| 61 |
-
# steps = [
|
| 62 |
-
# "Analyzing task requirements...",
|
| 63 |
-
# "Planning execution steps...",
|
| 64 |
-
# "Initializing computer interface...",
|
| 65 |
-
# "Executing task commands...",
|
| 66 |
-
# "Verifying results...",
|
| 67 |
-
# "Finalizing task completion...",
|
| 68 |
-
# ]
|
| 69 |
-
#
|
| 70 |
-
# for i, step in enumerate(steps):
|
| 71 |
-
# await asyncio.sleep(2) # Simulate processing time
|
| 72 |
-
#
|
| 73 |
-
# # Send progress update
|
| 74 |
-
# await self.websocket_manager.send_agent_progress(
|
| 75 |
-
# content=f"{step} ({i + 1}/{len(steps)})", message_id=message_id
|
| 76 |
-
# )
|
| 77 |
-
#
|
| 78 |
-
# # Simulate VNC URL events during processing
|
| 79 |
-
# if i == 2: # After "Initializing computer interface..."
|
| 80 |
-
# # Set VNC URL when computer interface is ready
|
| 81 |
-
# vnc_url = "http://localhost:6080/vnc.html?host=localhost&port=5900&autoconnect=true"
|
| 82 |
-
# await self.websocket_manager.send_vnc_url_set(
|
| 83 |
-
# vnc_url=vnc_url,
|
| 84 |
-
# content="Computer interface ready, VNC stream connected",
|
| 85 |
-
# )
|
| 86 |
-
# elif i == 4: # After "Verifying results..."
|
| 87 |
-
# # Unset VNC URL when task is almost complete
|
| 88 |
-
# await self.websocket_manager.send_vnc_url_unset(
|
| 89 |
-
# content="Task verification complete, disconnecting VNC stream"
|
| 90 |
-
# )
|
| 91 |
-
#
|
| 92 |
-
# # Calculate metadata
|
| 93 |
-
# end_time = datetime.now()
|
| 94 |
-
# start_time = self.active_tasks[message_id]["start_time"]
|
| 95 |
-
# time_taken = (end_time - start_time).total_seconds()
|
| 96 |
-
#
|
| 97 |
-
# metadata = AgentMetadata(
|
| 98 |
-
# tokensUsed=150 + len(content) * 2, # Simulate token usage
|
| 99 |
-
# timeTaken=time_taken,
|
| 100 |
-
# numberOfSteps=len(steps),
|
| 101 |
-
# )
|
| 102 |
-
#
|
| 103 |
-
# # Send completion event
|
| 104 |
-
# await self.websocket_manager.send_agent_complete(
|
| 105 |
-
# content=f"Task completed successfully: {content}",
|
| 106 |
-
# message_id=message_id,
|
| 107 |
-
# metadata=metadata,
|
| 108 |
-
# )
|
| 109 |
-
#
|
| 110 |
-
# # Clean up
|
| 111 |
-
# if message_id in self.active_tasks:
|
| 112 |
-
# del self.active_tasks[message_id]
|
| 113 |
-
#
|
| 114 |
-
# except Exception as e:
|
| 115 |
-
# # Send error event
|
| 116 |
-
# await self.websocket_manager.send_agent_error(
|
| 117 |
-
# content=f"Error processing task: {str(e)}", message_id=message_id
|
| 118 |
-
# )
|
| 119 |
-
#
|
| 120 |
-
# # Clean up
|
| 121 |
-
# if message_id in self.active_tasks:
|
| 122 |
-
# del self.active_tasks[message_id]
|
| 123 |
-
|
| 124 |
-
def get_active_tasks(self) -> dict:
|
| 125 |
-
"""Get currently active tasks"""
|
| 126 |
-
return self.active_tasks.copy()
|
| 127 |
-
|
| 128 |
-
def get_task_status(self, message_id: str) -> Optional[dict]:
|
| 129 |
-
"""Get status of a specific task"""
|
| 130 |
-
return self.active_tasks.get(message_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cua2-core/src/cua2-core/services/agents/get_agents.py
DELETED
|
@@ -1,57 +0,0 @@
|
|
| 1 |
-
from typing import Annotated, TypeAlias
|
| 2 |
-
|
| 3 |
-
from pydantic import Field
|
| 4 |
-
from smolagents import Model
|
| 5 |
-
|
| 6 |
-
from backend.models.models import AgentType
|
| 7 |
-
from backend.services.agents.normalized_1000_agent import Normalized1000Agent
|
| 8 |
-
from backend.services.agents.normalized_agent import NormalizedAgent
|
| 9 |
-
from backend.services.agents.pixel_coordonates_agent import PixelCoordinatesAgent
|
| 10 |
-
from backend.services.agents.prompt import (
|
| 11 |
-
Normalized1000CoordinatesSystemPrompt,
|
| 12 |
-
NormalizedCoordinatesSystemPrompt,
|
| 13 |
-
PixelCoordinatesSystemPrompt,
|
| 14 |
-
)
|
| 15 |
-
from computer_use_studio import Sandbox
|
| 16 |
-
|
| 17 |
-
Agent: TypeAlias = Annotated[
|
| 18 |
-
PixelCoordinatesAgent | Normalized1000Agent | NormalizedAgent,
|
| 19 |
-
Field(discriminator="AGENT_TYPE"),
|
| 20 |
-
]
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
def get_agent(
|
| 24 |
-
model: Model,
|
| 25 |
-
desktop: Sandbox,
|
| 26 |
-
agent_type: AgentType,
|
| 27 |
-
prompt_type: str,
|
| 28 |
-
data_dir: str,
|
| 29 |
-
**kwargs,
|
| 30 |
-
) -> Agent:
|
| 31 |
-
"""Get the agent by type"""
|
| 32 |
-
if agent_type == AgentType.PIXEL_COORDINATES:
|
| 33 |
-
return PixelCoordinatesAgent(
|
| 34 |
-
model=model,
|
| 35 |
-
desktop=desktop,
|
| 36 |
-
system_prompt=PixelCoordinatesSystemPrompt[prompt_type].value,
|
| 37 |
-
data_dir=data_dir,
|
| 38 |
-
**kwargs,
|
| 39 |
-
)
|
| 40 |
-
elif agent_type == AgentType.NORMALIZED_1000_COORDINATES:
|
| 41 |
-
return Normalized1000Agent(
|
| 42 |
-
model=model,
|
| 43 |
-
desktop=desktop,
|
| 44 |
-
system_prompt=Normalized1000CoordinatesSystemPrompt[prompt_type].value,
|
| 45 |
-
data_dir=data_dir,
|
| 46 |
-
**kwargs,
|
| 47 |
-
)
|
| 48 |
-
elif agent_type == AgentType.NORMALIZED_COORDINATES:
|
| 49 |
-
return Normalized1000Agent(
|
| 50 |
-
model=model,
|
| 51 |
-
desktop=desktop,
|
| 52 |
-
system_prompt=NormalizedCoordinatesSystemPrompt[prompt_type].value,
|
| 53 |
-
data_dir=data_dir,
|
| 54 |
-
**kwargs,
|
| 55 |
-
)
|
| 56 |
-
else:
|
| 57 |
-
raise ValueError(f"Invalid agent type: {agent_type}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cua2-core/src/cua2-core/services/agents/normalized_1000_agent.py
DELETED
|
@@ -1,293 +0,0 @@
|
|
| 1 |
-
import time
|
| 2 |
-
import unicodedata
|
| 3 |
-
from typing import List, Literal
|
| 4 |
-
|
| 5 |
-
# SmolaAgents imports
|
| 6 |
-
from smolagents import Model, Tool, tool
|
| 7 |
-
from smolagents.monitoring import LogLevel
|
| 8 |
-
|
| 9 |
-
from backend.models.models import AgentType
|
| 10 |
-
from backend.services.agents.prompt import Normalized1000CoordinatesSystemPrompt
|
| 11 |
-
from computer_use_studio import DesktopAgentBase, Sandbox
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
class Normalized1000Agent(DesktopAgentBase):
|
| 15 |
-
"""Agent for desktop automation with normalized coordinates (0 to 1000)"""
|
| 16 |
-
|
| 17 |
-
AGENT_TYPE = AgentType.NORMALIZED_1000_COORDINATES
|
| 18 |
-
|
| 19 |
-
def __init__(
|
| 20 |
-
self,
|
| 21 |
-
model: Model,
|
| 22 |
-
data_dir: str,
|
| 23 |
-
desktop: Sandbox,
|
| 24 |
-
system_prompt: Normalized1000CoordinatesSystemPrompt,
|
| 25 |
-
tools: List[Tool] | None = None,
|
| 26 |
-
max_steps: int = 20,
|
| 27 |
-
verbosity_level: LogLevel = LogLevel.INFO,
|
| 28 |
-
planning_interval: int | None = None,
|
| 29 |
-
use_v1_prompt: bool = False,
|
| 30 |
-
**kwargs,
|
| 31 |
-
):
|
| 32 |
-
super().__init__(
|
| 33 |
-
model=model,
|
| 34 |
-
data_dir=data_dir,
|
| 35 |
-
desktop=desktop,
|
| 36 |
-
system_prompt=system_prompt,
|
| 37 |
-
tools=tools,
|
| 38 |
-
max_steps=max_steps,
|
| 39 |
-
verbosity_level=verbosity_level,
|
| 40 |
-
planning_interval=planning_interval,
|
| 41 |
-
use_v1_prompt=use_v1_prompt,
|
| 42 |
-
**kwargs,
|
| 43 |
-
)
|
| 44 |
-
|
| 45 |
-
def _normalize_to_pixel(self, norm_x: int, norm_y: int) -> tuple[int, int]:
|
| 46 |
-
"""
|
| 47 |
-
Convert normalized coordinates (0-1000) to pixel coordinates
|
| 48 |
-
Args:
|
| 49 |
-
norm_x: Normalized x coordinate (0 to 1000)
|
| 50 |
-
norm_y: Normalized y coordinate (0 to 1000)
|
| 51 |
-
Returns:
|
| 52 |
-
Tuple of (pixel_x, pixel_y)
|
| 53 |
-
"""
|
| 54 |
-
# Clamp values to valid range
|
| 55 |
-
norm_x = max(0, min(1000, norm_x))
|
| 56 |
-
norm_y = max(0, min(1000, norm_y))
|
| 57 |
-
|
| 58 |
-
# Convert from 0-1000 range to 0-1 range, then to pixels
|
| 59 |
-
norm_x_float = norm_x / 1000.0
|
| 60 |
-
norm_y_float = norm_y / 1000.0
|
| 61 |
-
|
| 62 |
-
pixel_x = int(norm_x_float * self.width)
|
| 63 |
-
pixel_y = int(norm_y_float * self.height)
|
| 64 |
-
|
| 65 |
-
# Ensure we don't go outside screen bounds
|
| 66 |
-
pixel_x = max(0, min(self.width - 1, pixel_x))
|
| 67 |
-
pixel_y = max(0, min(self.height - 1, pixel_y))
|
| 68 |
-
|
| 69 |
-
return pixel_x, pixel_y
|
| 70 |
-
|
| 71 |
-
def _setup_desktop_tools(self):
|
| 72 |
-
"""Register all desktop tools with normalized coordinate support (0-1000)"""
|
| 73 |
-
|
| 74 |
-
@tool
|
| 75 |
-
def click(x: int, y: int) -> str:
|
| 76 |
-
"""
|
| 77 |
-
Performs a left-click at the specified normalized coordinates
|
| 78 |
-
Args:
|
| 79 |
-
x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
|
| 80 |
-
y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
|
| 81 |
-
"""
|
| 82 |
-
pixel_x, pixel_y = self._normalize_to_pixel(x, y)
|
| 83 |
-
self.desktop.left_click(pixel_x, pixel_y)
|
| 84 |
-
self.click_coordinates = (pixel_x, pixel_y)
|
| 85 |
-
self.logger.log(
|
| 86 |
-
f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
|
| 87 |
-
)
|
| 88 |
-
time.sleep(1)
|
| 89 |
-
return f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
|
| 90 |
-
|
| 91 |
-
@tool
|
| 92 |
-
def right_click(x: int, y: int) -> str:
|
| 93 |
-
"""
|
| 94 |
-
Performs a right-click at the specified normalized coordinates
|
| 95 |
-
Args:
|
| 96 |
-
x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
|
| 97 |
-
y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
|
| 98 |
-
"""
|
| 99 |
-
pixel_x, pixel_y = self._normalize_to_pixel(x, y)
|
| 100 |
-
self.desktop.right_click(pixel_x, pixel_y)
|
| 101 |
-
self.click_coordinates = (pixel_x, pixel_y)
|
| 102 |
-
self.logger.log(
|
| 103 |
-
f"Right-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
|
| 104 |
-
)
|
| 105 |
-
return f"Right-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
|
| 106 |
-
|
| 107 |
-
@tool
|
| 108 |
-
def double_click(x: int, y: int) -> str:
|
| 109 |
-
"""
|
| 110 |
-
Performs a double-click at the specified normalized coordinates
|
| 111 |
-
Args:
|
| 112 |
-
x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
|
| 113 |
-
y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
|
| 114 |
-
"""
|
| 115 |
-
pixel_x, pixel_y = self._normalize_to_pixel(x, y)
|
| 116 |
-
self.desktop.double_click(pixel_x, pixel_y)
|
| 117 |
-
self.click_coordinates = (pixel_x, pixel_y)
|
| 118 |
-
self.logger.log(
|
| 119 |
-
f"Double-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
|
| 120 |
-
)
|
| 121 |
-
return f"Double-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
|
| 122 |
-
|
| 123 |
-
@tool
|
| 124 |
-
def move_mouse(x: int, y: int) -> str:
|
| 125 |
-
"""
|
| 126 |
-
Moves the mouse cursor to the specified normalized coordinates
|
| 127 |
-
Args:
|
| 128 |
-
x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
|
| 129 |
-
y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
|
| 130 |
-
"""
|
| 131 |
-
pixel_x, pixel_y = self._normalize_to_pixel(x, y)
|
| 132 |
-
self.desktop.move_mouse(pixel_x, pixel_y)
|
| 133 |
-
self.logger.log(
|
| 134 |
-
f"Moved mouse to normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
|
| 135 |
-
)
|
| 136 |
-
return f"Moved mouse to normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
|
| 137 |
-
|
| 138 |
-
def normalize_text(text):
|
| 139 |
-
return "".join(
|
| 140 |
-
c
|
| 141 |
-
for c in unicodedata.normalize("NFD", text)
|
| 142 |
-
if not unicodedata.combining(c)
|
| 143 |
-
)
|
| 144 |
-
|
| 145 |
-
@tool
|
| 146 |
-
def write(text: str) -> str:
|
| 147 |
-
"""
|
| 148 |
-
Types the specified text at the current cursor position.
|
| 149 |
-
Args:
|
| 150 |
-
text: The text to type
|
| 151 |
-
"""
|
| 152 |
-
# clean_text = normalize_text(text)
|
| 153 |
-
self.desktop.write(text, delay_in_ms=10)
|
| 154 |
-
self.logger.log(f"Typed text: '{text}'")
|
| 155 |
-
time.sleep(1)
|
| 156 |
-
return f"Typed text: '{text}'"
|
| 157 |
-
|
| 158 |
-
@tool
|
| 159 |
-
def press(key: str) -> str:
|
| 160 |
-
"""
|
| 161 |
-
Presses a keyboard key or combination of keys
|
| 162 |
-
Args:
|
| 163 |
-
key: The key to press (e.g. "enter", "space", "backspace", etc.) or a multiple keys string to press, for example "ctrl+a" or "ctrl+shift+a".
|
| 164 |
-
"""
|
| 165 |
-
self.desktop.press(key)
|
| 166 |
-
self.logger.log(f"Pressed key: {key}")
|
| 167 |
-
time.sleep(0.1)
|
| 168 |
-
return f"Pressed key: {key}"
|
| 169 |
-
|
| 170 |
-
@tool
|
| 171 |
-
def drag(x1: int, y1: int, x2: int, y2: int) -> str:
|
| 172 |
-
"""
|
| 173 |
-
Clicks at normalized coordinates [x1, y1], drags mouse to [x2, y2], then release click.
|
| 174 |
-
Args:
|
| 175 |
-
x1: origin normalized x coordinate (0 to 1000)
|
| 176 |
-
y1: origin normalized y coordinate (0 to 1000)
|
| 177 |
-
x2: end normalized x coordinate (0 to 1000)
|
| 178 |
-
y2: end normalized y coordinate (0 to 1000)
|
| 179 |
-
"""
|
| 180 |
-
pixel_x1, pixel_y1 = self._normalize_to_pixel(x1, y1)
|
| 181 |
-
pixel_x2, pixel_y2 = self._normalize_to_pixel(x2, y2)
|
| 182 |
-
self.desktop.drag((pixel_x1, pixel_y1), (pixel_x2, pixel_y2))
|
| 183 |
-
message = f"Dragged and dropped from normalized [{x1}, {y1}] to [{x2}, {y2}] -> pixels [{pixel_x1}, {pixel_y1}] to [{pixel_x2}, {pixel_y2}]"
|
| 184 |
-
self.logger.log(message)
|
| 185 |
-
return message
|
| 186 |
-
|
| 187 |
-
@tool
|
| 188 |
-
def scroll(
|
| 189 |
-
x: int,
|
| 190 |
-
y: int,
|
| 191 |
-
direction: Literal["up", "down"] = "down",
|
| 192 |
-
amount: int = 2,
|
| 193 |
-
) -> str:
|
| 194 |
-
"""
|
| 195 |
-
Moves the mouse to selected normalized coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
|
| 196 |
-
Args:
|
| 197 |
-
x: The normalized x coordinate (0 to 1000) of the element to scroll/zoom
|
| 198 |
-
y: The normalized y coordinate (0 to 1000) of the element to scroll/zoom
|
| 199 |
-
direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
|
| 200 |
-
amount: The amount to scroll. A good amount is 1 or 2.
|
| 201 |
-
"""
|
| 202 |
-
pixel_x, pixel_y = self._normalize_to_pixel(x, y)
|
| 203 |
-
self.desktop.move_mouse(pixel_x, pixel_y)
|
| 204 |
-
self.desktop.scroll(direction=direction, amount=amount)
|
| 205 |
-
message = f"Scrolled {direction} by {amount} at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
|
| 206 |
-
self.logger.log(message)
|
| 207 |
-
return message
|
| 208 |
-
|
| 209 |
-
@tool
|
| 210 |
-
def wait(seconds: float) -> str:
|
| 211 |
-
"""
|
| 212 |
-
Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
|
| 213 |
-
Args:
|
| 214 |
-
seconds: Number of seconds to wait, generally 3 is enough.
|
| 215 |
-
"""
|
| 216 |
-
time.sleep(seconds)
|
| 217 |
-
self.logger.log(f"Waited for {seconds} seconds")
|
| 218 |
-
return f"Waited for {seconds} seconds"
|
| 219 |
-
|
| 220 |
-
@tool
|
| 221 |
-
def open(file_or_url: str) -> str:
|
| 222 |
-
"""
|
| 223 |
-
Directly opens a browser with the specified url or opens a file with the default application: use this at start of web searches rather than trying to click the browser or open a file by clicking.
|
| 224 |
-
Args:
|
| 225 |
-
file_or_url: The URL or file to open
|
| 226 |
-
"""
|
| 227 |
-
|
| 228 |
-
self.desktop.open(file_or_url)
|
| 229 |
-
# Give it time to load
|
| 230 |
-
time.sleep(2)
|
| 231 |
-
self.logger.log(f"Opening: {file_or_url}")
|
| 232 |
-
return f"Opened: {file_or_url}"
|
| 233 |
-
|
| 234 |
-
@tool
|
| 235 |
-
def launch_app(app_name: str) -> str:
|
| 236 |
-
"""
|
| 237 |
-
Launches the specified application.
|
| 238 |
-
Args:
|
| 239 |
-
app_name: the name of the application to launch
|
| 240 |
-
"""
|
| 241 |
-
self.desktop.launch(app_name)
|
| 242 |
-
self.logger.log(f"Launched app: {app_name}")
|
| 243 |
-
return f"Launched app: {app_name}"
|
| 244 |
-
|
| 245 |
-
@tool
|
| 246 |
-
def execute(command: str) -> str:
|
| 247 |
-
"""
|
| 248 |
-
Executes a terminal command in the desktop environment.
|
| 249 |
-
Args:
|
| 250 |
-
command: The command to execute
|
| 251 |
-
"""
|
| 252 |
-
self.desktop.execute_command(command)
|
| 253 |
-
self.logger.log(f"Executed command: {command}")
|
| 254 |
-
return f"Executed command: {command}"
|
| 255 |
-
|
| 256 |
-
@tool
|
| 257 |
-
def refresh() -> str:
|
| 258 |
-
"""
|
| 259 |
-
Refreshes the current web page if you're in a browser.
|
| 260 |
-
"""
|
| 261 |
-
self.desktop.press(["ctrl", "r"])
|
| 262 |
-
self.logger.log("Refreshed the current page")
|
| 263 |
-
return "Refreshed the current page"
|
| 264 |
-
|
| 265 |
-
@tool
|
| 266 |
-
def go_back() -> str:
|
| 267 |
-
"""
|
| 268 |
-
Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
|
| 269 |
-
Args:
|
| 270 |
-
"""
|
| 271 |
-
self.desktop.press(["alt", "left"])
|
| 272 |
-
self.logger.log("Went back one page")
|
| 273 |
-
return "Went back one page"
|
| 274 |
-
|
| 275 |
-
# Register the tools
|
| 276 |
-
self.tools["click"] = click
|
| 277 |
-
self.tools["right_click"] = right_click
|
| 278 |
-
self.tools["double_click"] = double_click
|
| 279 |
-
self.tools["move_mouse"] = move_mouse
|
| 280 |
-
self.tools["write"] = write
|
| 281 |
-
self.tools["press"] = press
|
| 282 |
-
self.tools["scroll"] = scroll
|
| 283 |
-
self.tools["wait"] = wait
|
| 284 |
-
self.tools["open"] = open
|
| 285 |
-
self.tools["go_back"] = go_back
|
| 286 |
-
self.tools["drag"] = drag
|
| 287 |
-
self.tools["launch_app"] = launch_app
|
| 288 |
-
self.tools["execute"] = execute
|
| 289 |
-
self.tools["refresh"] = refresh
|
| 290 |
-
self.tools["refresh"] = refresh
|
| 291 |
-
self.tools["execute"] = execute
|
| 292 |
-
self.tools["refresh"] = refresh
|
| 293 |
-
self.tools["refresh"] = refresh
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cua2-core/src/cua2-core/services/agents/normalized_agent.py
DELETED
|
@@ -1,282 +0,0 @@
|
|
| 1 |
-
import time
|
| 2 |
-
import unicodedata
|
| 3 |
-
from typing import List, Literal
|
| 4 |
-
|
| 5 |
-
# SmolaAgents imports
|
| 6 |
-
from smolagents import Model, Tool, tool
|
| 7 |
-
from smolagents.monitoring import LogLevel
|
| 8 |
-
|
| 9 |
-
from backend.models.models import AgentType
|
| 10 |
-
from backend.services.agents.prompt import NormalizedCoordinatesSystemPrompt
|
| 11 |
-
from computer_use_studio import DesktopAgentBase, Sandbox
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
class NormalizedAgent(DesktopAgentBase):
    """Agent for desktop automation with normalized coordinates (0.0 to 1.0).

    The coordinate-taking tools registered by ``_setup_desktop_tools`` accept
    x/y values in the 0.0-1.0 range and convert them to pixels with
    ``_normalize_to_pixel`` before calling the sandbox.
    """

    AGENT_TYPE = AgentType.NORMALIZED_COORDINATES

    def __init__(
        self,
        model: Model,
        data_dir: str,
        desktop: Sandbox,
        system_prompt: NormalizedCoordinatesSystemPrompt,
        tools: List[Tool] | None = None,
        max_steps: int = 20,
        verbosity_level: LogLevel = LogLevel.INFO,
        planning_interval: int | None = None,
        use_v1_prompt: bool = False,
        **kwargs,
    ):
        """Create the agent by forwarding every argument to ``DesktopAgentBase``.

        No argument is inspected or modified here; their meaning is defined
        by the base class.
        """
        super().__init__(
            model=model,
            data_dir=data_dir,
            desktop=desktop,
            system_prompt=system_prompt,
            tools=tools,
            max_steps=max_steps,
            verbosity_level=verbosity_level,
            planning_interval=planning_interval,
            use_v1_prompt=use_v1_prompt,
            **kwargs,
        )

    def _normalize_to_pixel(self, norm_x: float, norm_y: float) -> tuple[int, int]:
        """Convert normalized coordinates (0.0-1.0) to pixel coordinates.

        Args:
            norm_x: Normalized x coordinate (0.0 to 1.0)
            norm_y: Normalized y coordinate (0.0 to 1.0)

        Returns:
            Tuple of (pixel_x, pixel_y), each within the screen bounds given
            by ``self.width`` and ``self.height``.
        """
        # Clamp values to valid range
        norm_x = max(0.0, min(1.0, norm_x))
        norm_y = max(0.0, min(1.0, norm_y))

        pixel_x = int(norm_x * self.width)
        pixel_y = int(norm_y * self.height)

        # A normalized value of exactly 1.0 maps to width/height, which is one
        # past the last valid pixel index, so clamp once more.
        pixel_x = max(0, min(self.width - 1, pixel_x))
        pixel_y = max(0, min(self.height - 1, pixel_y))

        return pixel_x, pixel_y

    def _setup_desktop_tools(self):
        """Register all desktop tools with normalized coordinate support.

        Each tool is a closure over ``self`` and is stored in ``self.tools``
        under its function name. Every tool logs the same message it returns.
        """
        # NOTE: the docstrings of the @tool functions below are kept verbatim;
        # they are consumed by the ``tool`` decorator, not just by readers.

        def point_text(x: float, y: float, pixel_x: int, pixel_y: int) -> str:
            # Single place that formats a position, so every tool reports
            # both the value it received and the pixel actually used.
            return f"normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"

        @tool
        def click(x: float, y: float) -> str:
            """
            Performs a left-click at the specified normalized coordinates
            Args:
                x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
                y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
            """
            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
            self.desktop.left_click(pixel_x, pixel_y)
            self.click_coordinates = (pixel_x, pixel_y)
            message = f"Clicked at {point_text(x, y, pixel_x, pixel_y)}"
            self.logger.log(message)
            return message

        @tool
        def right_click(x: float, y: float) -> str:
            """
            Performs a right-click at the specified normalized coordinates
            Args:
                x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
                y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
            """
            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
            self.desktop.right_click(pixel_x, pixel_y)
            self.click_coordinates = (pixel_x, pixel_y)
            message = f"Right-clicked at {point_text(x, y, pixel_x, pixel_y)}"
            self.logger.log(message)
            return message

        @tool
        def double_click(x: float, y: float) -> str:
            """
            Performs a double-click at the specified normalized coordinates
            Args:
                x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
                y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
            """
            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
            self.desktop.double_click(pixel_x, pixel_y)
            self.click_coordinates = (pixel_x, pixel_y)
            message = f"Double-clicked at {point_text(x, y, pixel_x, pixel_y)}"
            self.logger.log(message)
            return message

        @tool
        def move_mouse(x: float, y: float) -> str:
            """
            Moves the mouse cursor to the specified normalized coordinates
            Args:
                x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
                y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
            """
            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
            self.desktop.move_mouse(pixel_x, pixel_y)
            message = f"Moved mouse to {point_text(x, y, pixel_x, pixel_y)}"
            self.logger.log(message)
            return message

        @tool
        def write(text: str) -> str:
            """
            Types the specified text at the current cursor position.
            Args:
                text: The text to type
            """
            self.desktop.write(text, delay_in_ms=10)
            self.logger.log(f"Typed text: '{text}'")
            return f"Typed text: '{text}'"

        @tool
        def press(key: str) -> str:
            """
            Presses a keyboard key or combination of keys
            Args:
                key: The key to press (e.g. "enter", "space", "backspace", etc.) or a multiple keys string to press, for example "ctrl+a" or "ctrl+shift+a".
            """
            self.desktop.press(key)
            self.logger.log(f"Pressed key: {key}")
            return f"Pressed key: {key}"

        @tool
        def drag(x1: float, y1: float, x2: float, y2: float) -> str:
            """
            Clicks at normalized coordinates [x1, y1], drags mouse to [x2, y2], then release click.
            Args:
                x1: origin normalized x coordinate (0.0 to 1.0)
                y1: origin normalized y coordinate (0.0 to 1.0)
                x2: end normalized x coordinate (0.0 to 1.0)
                y2: end normalized y coordinate (0.0 to 1.0)
            """
            pixel_x1, pixel_y1 = self._normalize_to_pixel(x1, y1)
            pixel_x2, pixel_y2 = self._normalize_to_pixel(x2, y2)
            self.desktop.drag((pixel_x1, pixel_y1), (pixel_x2, pixel_y2))
            message = (
                f"Dragged and dropped from normalized [{x1}, {y1}] to [{x2}, {y2}]"
                f" -> pixels [{pixel_x1}, {pixel_y1}] to [{pixel_x2}, {pixel_y2}]"
            )
            self.logger.log(message)
            return message

        @tool
        def scroll(
            x: float,
            y: float,
            direction: Literal["up", "down"] = "down",
            amount: int = 2,
        ) -> str:
            """
            Moves the mouse to selected normalized coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
            Args:
                x: The normalized x coordinate (0.0 to 1.0) of the element to scroll/zoom
                y: The normalized y coordinate (0.0 to 1.0) of the element to scroll/zoom
                direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
                amount: The amount to scroll. A good amount is 1 or 2.
            """
            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
            self.desktop.move_mouse(pixel_x, pixel_y)
            self.desktop.scroll(direction=direction, amount=amount)
            message = (
                f"Scrolled {direction} by {amount} at "
                f"{point_text(x, y, pixel_x, pixel_y)}"
            )
            self.logger.log(message)
            return message

        @tool
        def wait(seconds: float) -> str:
            """
            Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
            Args:
                seconds: Number of seconds to wait, generally 3 is enough.
            """
            time.sleep(seconds)
            self.logger.log(f"Waited for {seconds} seconds")
            return f"Waited for {seconds} seconds"

        @tool
        def open(file_or_url: str) -> str:
            """
            Directly opens a browser with the specified url or opens a file with the default application: use this at start of web searches rather than trying to click the browser or open a file by clicking.
            Args:
                file_or_url: The URL or file to open
            """

            self.desktop.open(file_or_url)
            # Give it time to load
            time.sleep(2)
            self.logger.log(f"Opening: {file_or_url}")
            return f"Opened: {file_or_url}"

        @tool
        def launch_app(app_name: str) -> str:
            """
            Launches the specified application.
            Args:
                app_name: the name of the application to launch
            """
            self.desktop.launch(app_name)
            self.logger.log(f"Launched app: {app_name}")
            return f"Launched app: {app_name}"

        @tool
        def execute(command: str) -> str:
            """
            Executes a terminal command in the desktop environment.
            Args:
                command: The command to execute
            """
            self.desktop.execute_command(command)
            self.logger.log(f"Executed command: {command}")
            return f"Executed command: {command}"

        @tool
        def refresh() -> str:
            """
            Refreshes the current web page if you're in a browser.
            """
            self.desktop.press(["ctrl", "r"])
            self.logger.log("Refreshed the current page")
            return "Refreshed the current page"

        @tool
        def go_back() -> str:
            """
            Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
            Args:
            """
            self.desktop.press(["alt", "left"])
            self.logger.log("Went back one page")
            return "Went back one page"

        # Register the tools
        self.tools["click"] = click
        self.tools["right_click"] = right_click
        self.tools["double_click"] = double_click
        self.tools["move_mouse"] = move_mouse
        self.tools["write"] = write
        self.tools["press"] = press
        self.tools["scroll"] = scroll
        self.tools["wait"] = wait
        self.tools["open"] = open
        self.tools["go_back"] = go_back
        self.tools["drag"] = drag
        self.tools["launch_app"] = launch_app
        self.tools["execute"] = execute
        self.tools["refresh"] = refresh
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cua2-core/src/cua2-core/services/agents/pixel_coordonates_agent.py
DELETED
|
@@ -1,317 +0,0 @@
|
|
| 1 |
-
import time
|
| 2 |
-
import unicodedata
|
| 3 |
-
from typing import List, Literal
|
| 4 |
-
|
| 5 |
-
# SmolaAgents imports
|
| 6 |
-
from smolagents import Model, Tool, tool
|
| 7 |
-
from smolagents.monitoring import LogLevel
|
| 8 |
-
|
| 9 |
-
from backend.models.models import AgentType
|
| 10 |
-
from backend.services.agents.prompt import PixelCoordinatesSystemPrompt
|
| 11 |
-
from computer_use_studio import DesktopAgentBase, Sandbox
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
class PixelCoordinatesAgent(DesktopAgentBase):
    """Agent for desktop automation"""

    AGENT_TYPE = AgentType.PIXEL_COORDINATES

    def __init__(
        self,
        model: Model,
        data_dir: str,
        desktop: Sandbox,
        system_prompt: PixelCoordinatesSystemPrompt,
        tools: List[Tool] | None = None,
        max_steps: int = 20,
        verbosity_level: LogLevel = LogLevel.INFO,
        planning_interval: int | None = None,
        use_v1_prompt: bool = False,
        **kwargs,
    ):
        """Hand every constructor argument straight to ``DesktopAgentBase``."""
        forwarded = {
            "model": model,
            "data_dir": data_dir,
            "desktop": desktop,
            "system_prompt": system_prompt,
            "tools": tools,
            "max_steps": max_steps,
            "verbosity_level": verbosity_level,
            "planning_interval": planning_interval,
            "use_v1_prompt": use_v1_prompt,
        }
        super().__init__(**forwarded, **kwargs)

        # OPTIONAL: a custom prompt template can be installed here - see
        # src/computer_use_studio/desktop_agent/desktop_agent_base.py for the
        # default template, e.g.:
        # self.prompt_templates["system_prompt"] = CUSTOM_PROMPT_TEMPLATE.replace(
        #     "<<resolution_x>>", str(self.width)
        # ).replace("<<resolution_y>>", str(self.height))
        # Important: adapt the prompt to your action space for better results.

    def _setup_desktop_tools(self):
        """Register all desktop tools"""
        # The @tool docstrings are reproduced verbatim; only the bodies and
        # the registration code are written differently.

        @tool
        def click(x: int, y: int) -> str:
            """
            Performs a left-click at the specified coordinates
            Args:
                x: The x coordinate (horizontal position)
                y: The y coordinate (vertical position)
            """
            self.desktop.left_click(x, y)
            self.click_coordinates = (x, y)
            outcome = f"Clicked at coordinates ({x}, {y})"
            self.logger.log(outcome)
            return outcome

        @tool
        def right_click(x: int, y: int) -> str:
            """
            Performs a right-click at the specified coordinates
            Args:
                x: The x coordinate (horizontal position)
                y: The y coordinate (vertical position)
            """
            self.desktop.right_click(x, y)
            self.click_coordinates = (x, y)
            outcome = f"Right-clicked at coordinates ({x}, {y})"
            self.logger.log(outcome)
            return outcome

        @tool
        def double_click(x: int, y: int) -> str:
            """
            Performs a double-click at the specified coordinates
            Args:
                x: The x coordinate (horizontal position)
                y: The y coordinate (vertical position)
            """
            self.desktop.double_click(x, y)
            self.click_coordinates = (x, y)
            outcome = f"Double-clicked at coordinates ({x}, {y})"
            self.logger.log(outcome)
            return outcome

        @tool
        def move_mouse(x: int, y: int) -> str:
            """
            Moves the mouse cursor to the specified coordinates
            Args:
                x: The x coordinate (horizontal position)
                y: The y coordinate (vertical position)
            """
            self.desktop.move_mouse(x, y)
            outcome = f"Moved mouse to coordinates ({x}, {y})"
            self.logger.log(outcome)
            return outcome

        def normalize_text(text):
            # Decompose to NFD, then keep only the non-combining characters.
            decomposed = unicodedata.normalize("NFD", text)
            return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

        @tool
        def write(text: str) -> str:
            """
            Types the specified text at the current cursor position.
            Args:
                text: The text to type
            """
            # normalize_text is intentionally not applied: text is typed as given.
            self.desktop.write(text, delay_in_ms=10)
            outcome = f"Typed text: '{text}'"
            self.logger.log(outcome)
            return outcome

        @tool
        def press(key: str) -> str:
            """
            Presses a keyboard key or combination of keys
            Args:
                key: The key to press (e.g. "enter", "space", "backspace", etc.) or a multiple keys string to press, for example "ctrl+a" or "ctrl+shift+a".
            """
            self.desktop.press(key)
            outcome = f"Pressed key: {key}"
            self.logger.log(outcome)
            return outcome

        @tool
        def drag(x1: int, y1: int, x2: int, y2: int) -> str:
            """
            Clicks [x1, y1], drags mouse to [x2, y2], then release click.
            Args:
                x1: origin x coordinate
                y1: origin y coordinate
                x2: end x coordinate
                y2: end y coordinate
            """
            start, end = (x1, y1), (x2, y2)
            self.desktop.drag(start, end)
            outcome = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
            self.logger.log(outcome)
            return outcome

        @tool
        def scroll(
            x: int, y: int, direction: Literal["up", "down"] = "down", amount: int = 2
        ) -> str:
            """
            Moves the mouse to selected coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
            Args:
                x: The x coordinate (horizontal position) of the element to scroll/zoom
                y: The y coordinate (vertical position) of the element to scroll/zoom
                direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
                amount: The amount to scroll. A good amount is 1 or 2.
            """
            self.desktop.move_mouse(x, y)
            self.desktop.scroll(direction=direction, amount=amount)
            outcome = f"Scrolled {direction} by {amount}"
            self.logger.log(outcome)
            return outcome

        @tool
        def wait(seconds: float) -> str:
            """
            Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
            Args:
                seconds: Number of seconds to wait, generally 3 is enough.
            """
            time.sleep(seconds)
            outcome = f"Waited for {seconds} seconds"
            self.logger.log(outcome)
            return outcome

        @tool
        def open(file_or_url: str) -> str:
            """
            Directly opens a browser with the specified url or opens a file with the default application: use this at start of web searches rather than trying to click the browser or open a file by clicking.
            Args:
                file_or_url: The URL or file to open
            """

            self.desktop.open(file_or_url)
            # Fixed pause so the target has a chance to load before the next step.
            time.sleep(2)
            self.logger.log(f"Opening: {file_or_url}")
            return f"Opened: {file_or_url}"

        @tool
        def launch_app(app_name: str) -> str:
            """
            Launches the specified application.
            Args:
                app_name: the name of the application to launch
            """
            self.desktop.launch(app_name)
            outcome = f"Launched app: {app_name}"
            self.logger.log(outcome)
            return outcome

        @tool
        def execute(command: str) -> str:
            """
            Executes a terminal command in the desktop environment.
            Args:
                command: The command to execute
            """
            self.desktop.execute_command(command)
            outcome = f"Executed command: {command}"
            self.logger.log(outcome)
            return outcome

        @tool
        def refresh() -> str:
            """
            Refreshes the current web page if you're in a browser.
            """
            self.desktop.press(["ctrl", "r"])
            outcome = "Refreshed the current page"
            self.logger.log(outcome)
            return outcome

        @tool
        def go_back() -> str:
            """
            Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
            Args:
            """
            self.desktop.press(["alt", "left"])
            outcome = "Went back one page"
            self.logger.log(outcome)
            return outcome

        # Register the tools, in the same order as the original assignments.
        registry = (
            ("click", click),
            ("right_click", right_click),
            ("double_click", double_click),
            ("move_mouse", move_mouse),
            ("write", write),
            ("press", press),
            ("scroll", scroll),
            ("wait", wait),
            ("open", open),
            ("go_back", go_back),
            ("drag", drag),
            ("launch_app", launch_app),
            ("execute", execute),
            ("refresh", refresh),
        )
        for tool_name, tool_obj in registry:
            self.tools[tool_name] = tool_obj
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
if __name__ == "__main__":
    # Manual entry point: repeatedly asks for a task on stdin and runs a
    # PixelCoordinatesAgent on it in a fresh, recorded sandbox.

    # ================================
    # MODEL CONFIGURATION
    # ================================

    # import os

    # from smolagents import OpenAIServerModel

    # model = OpenAIServerModel(
    #     model_id="gpt-4.1",
    #     api_key=os.getenv("OPENAI_API_KEY"),
    # )

    # For Inference Endpoints
    # from smolagents import HfApiModel
    # model = HfApiModel(
    #     model_id="Qwen/Qwen2.5-VL-72B-Instruct",
    #     token=os.getenv("HF_TOKEN"),
    #     provider="nebius",
    # )

    # For Transformer models
    # from smolagents import TransformersModel
    # model = TransformersModel(
    #     model_id="Qwen/Qwen2.5-VL-72B-Instruct",
    #     device_map="auto",
    #     torch_dtype="auto",
    #     trust_remote_code=True,
    # )

    # For other providers
    from smolagents import LiteLLMModel

    model = LiteLLMModel(model_id="anthropic/claude-sonnet-4-5-20250929")
    # model = LiteLLMModel(model_id="gemini/gemini-2.5-flash")

    # ================================
    # RUN AGENT
    # ================================

    # Interactive task input loop
    while True:
        # Read the task outside the try/finally below so that quitting never
        # triggers sandbox/agent cleanup.
        try:
            task = input("\nEnter a task (empty line to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            break
        if not task:
            break

        # Reset per iteration: the cleanup in ``finally`` must only ever see
        # objects created for *this* task.
        sandbox = None
        agent = None
        try:
            sandbox = Sandbox(headless=False, resolution=(1024, 1024))
            sandbox.start_recording()
            # NOTE(review): the enum member is passed because __init__ is
            # annotated with PixelCoordinatesSystemPrompt - confirm that
            # DesktopAgentBase does not expect the raw string (.value).
            agent = PixelCoordinatesAgent(
                model=model,
                data_dir="data",
                desktop=sandbox,
                system_prompt=PixelCoordinatesSystemPrompt.FORM_SYSTEM_PROMPT,
            )

            print("\n🤖 Agent is working on your task...")
            print("-" * 60)
            result = agent.run(task)
            print("\n✅ Task completed successfully!")
            print(f"Result: {result}")
        except Exception as e:
            # Top-level boundary of an interactive loop: report and carry on
            # with the next task instead of crashing.
            print(f"\n❌ Error occurred: {str(e)}")
        finally:
            if sandbox:
                sandbox.end_recording("recording.mp4")
            if agent:
                agent.close()

        print("\n" + "=" * 60)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cua2-core/src/cua2-core/services/agents/prompt.py
DELETED
|
@@ -1,548 +0,0 @@
|
|
| 1 |
-
from enum import Enum
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
class PixelCoordinatesSystemPrompt(Enum):
    """Pixel coordinates system prompt"""

    # Prompt template for the web-form agent. It contains ``<<current_date>>``,
    # ``<<resolution_x>>`` and ``<<resolution_y>>`` placeholders plus a
    # Jinja-style ``{% for tool in tools.values() %}`` loop; substitution and
    # rendering happen outside this module.
    FORM_SYSTEM_PROMPT = """You are a web form automation assistant that can control a remote desktop environment with a web browser open.
The current date is <<current_date>>.

<action_process>
You will be given a task to complete in several steps (e.g. filling forms, signing up, logging in, submitting claims).
At each step you will perform **one action**.
After each action, you will receive an updated screenshot.
Then you will proceed as follows, with these sections — do not skip any:

Short term goal: ...
What I see: ...
Reflection: ...
Action:
```python
tool_name(arguments)
```<end_code>

Always format your Action section as **Python code blocks** exactly as shown above.
</action_process>

<tools>
On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
{%- for tool in tools.values() %}
- {{ tool.name }}: {{ tool.description }}
Takes inputs: {{tool.inputs}}
Returns an output of type: {{tool.output_type}}
{%- endfor %}
</tools>

<web_form_guidelines>
Look at the elements on the screen (input fields, checkboxes, buttons, dropdowns) to decide where to interact.
The desktop has a resolution of <<resolution_x>>x<<resolution_y>> pixels — use that to decide mouse coordinates.
**Never use hypothetical or assumed coordinates; always use real coordinates visible on the screenshot.**

### Typical Web Form Interactions
- **Input fields**: click in the field first to focus it, then use `write("text")`.
- **Passwords**: type them just like text — `write("password123")`.
- **Checkboxes / radio buttons**: use `click(x,y)` directly on the box/circle.
- **Dropdown menus**: click to open, then click the option or use arrow keys + press("enter").
- **Submit buttons**: identify clearly labelled “Sign up”, “Sign in”, “Submit” buttons and click at their coordinates.
- **Captcha or 2FA**: wait or prompt for external handling (do not bypass security).
- **Pop-ups**: try `press("enter")` to confirm or `press("escape")` to close if they block your action.

### Grouping Multiple Inputs
- If multiple fields, checkboxes, or similar controls are clearly visible **and** can be filled/clicked in sequence without ambiguity, you may include **several actions in one code block**.
- Keep each Action still in a **Python code block** but with multiple tool calls inside, for example:
```python
click(450, 320) # Email field
wait(0.1)
write("user@example.com")
click(450, 380) # Password field
wait(0.1)
write("mypassword123")
click(430, 600) # Checkbox “Accept terms”
wait(0.1)
```<end_code>
- Only group actions when:
1. They’re all part of the **same form or step**,
2. The screenshot clearly shows all elements and coordinates,
3. The order of operations is obvious.
- Otherwise, default back to one Action per step.

### Precision
- Always **click before typing** to ensure the right field is active.
- Always **scroll if needed** to bring elements into view before clicking.
- Always **validate each action** via the screenshot before continuing.

</web_form_guidelines>

<task_resolution_example>
For a task like “Sign up for an account and submit the form”:

Step 1:
Short term goal: I want to open the signup page.
What I see: The browser is open on the homepage.
Reflection: I will open the signup URL directly.
Action:
```python
open("https://example.com/signup")
wait(3)
```<end_code>

Step 2:
Short term goal: I want to fill the “Email” field.
What I see: I see the signup form with an “Email” field at (450, 320).
Reflection: I will click inside the field then type my email.
Action:
```python
click(450, 320)
write("user@example.com")
```<end_code>

Step 3:
Short term goal: I want to check the “I accept terms” checkbox.
What I see: The checkbox is at (430, 600).
Reflection: I will click it.
Action:
```python
click(430, 600)
```<end_code>

Step 4:
Short term goal: I want to submit the form.
What I see: The “Sign Up” button at (500, 700).
Reflection: I will click the button to submit.
Action:
```python
click(500, 700)
wait(3)
```<end_code>

Step 5:
Short term goal: Verify signup completed.
What I see: A confirmation page “Welcome user@example.com”.
Reflection: Task complete.
Action:
```python
final_answer("Signup completed")
```<end_code>
</task_resolution_example>

<general_guidelines>
# GUI Agent Guidelines for Web Forms

## Environment Overview
Ubuntu 22.04 XFCE4 desktop with Google Chrome/Chromium browser.
Agent can fill forms, sign up, sign in, click checkboxes, submit claims.

## Core Principles

### 1. Screenshot Analysis
- Always analyze the latest screenshot carefully before each action.
- Validate that previous actions worked by examining the current state.
- If an action didn’t work, try an alternative rather than repeating blindly.

### 2. Action Execution
- Execute one action or multiple actions at a time (grouped in one code block).
- Wait for appropriate loading times using `wait()` but not indefinitely.
- Scroll to bring hidden elements into view.

### 3. Keyboard Shortcuts
- Use `tab` to move between fields, `space` to toggle checkboxes, `enter` to submit forms.
- Copy/paste: `ctrl+C`, `ctrl+V`.
- Refresh page: `refresh()`.

### 4. Error Recovery
- If clicking doesn’t work, try double_click or right_click.
- If typing doesn’t appear, ensure the field is focused with click.
- If popups block the screen, try `press("enter")` or `press("escape")`.

### 5. Security & Privacy
- Don’t attempt to bypass captchas or 2FA automatically.
- Don’t store credentials in plain text unless instructed.

### 6. Final Answer
- When the form is successfully submitted or the goal achieved, use:
```python
final_answer("Done")
```<end_code>
</general_guidelines>
"""
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
class Normalized1000CoordinatesSystemPrompt(Enum):
|
| 171 |
-
"""Normalized 1000 coordinates system prompt"""
|
| 172 |
-
|
| 173 |
-
FORM_SYSTEM_PROMPT = """You are a web form automation assistant that can control a remote desktop environment with a web browser open.
|
| 174 |
-
The current date is <<current_date>>.
|
| 175 |
-
|
| 176 |
-
<action_process>
|
| 177 |
-
You will be given a task to complete in several steps (e.g. filling forms, signing up, logging in, submitting claims).
|
| 178 |
-
At each step you will perform **one action**.
|
| 179 |
-
After each action, you will receive an updated screenshot.
|
| 180 |
-
Then you will proceed as follows, with these sections — do not skip any:
|
| 181 |
-
|
| 182 |
-
Short term goal: ...
|
| 183 |
-
What I see: ...
|
| 184 |
-
Reflection: ...
|
| 185 |
-
Action:
|
| 186 |
-
```python
|
| 187 |
-
tool_name(arguments)
|
| 188 |
-
```<end_code>
|
| 189 |
-
|
| 190 |
-
Always format your Action section as **Python code blocks** exactly as shown above.
|
| 191 |
-
</action_process>
|
| 192 |
-
|
| 193 |
-
<tools>
|
| 194 |
-
On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
|
| 195 |
-
{%- for tool in tools.values() %}
|
| 196 |
-
- {{ tool.name }}: {{ tool.description }}
|
| 197 |
-
Takes inputs: {{tool.inputs}}
|
| 198 |
-
Returns an output of type: {{tool.output_type}}
|
| 199 |
-
{%- endfor %}
|
| 200 |
-
</tools>
|
| 201 |
-
|
| 202 |
-
<coordinate_system>
|
| 203 |
-
**IMPORTANT: This system uses NORMALIZED COORDINATES (0 to 1000)**
|
| 204 |
-
|
| 205 |
-
You must use normalized coordinates:
|
| 206 |
-
- **x-coordinate**: 0 = left edge, 1000 = right edge of screen
|
| 207 |
-
- **y-coordinate**: 0 = top edge, 1000 = bottom edge of screen
|
| 208 |
-
- **Example**: Center of screen is (500, 500)
|
| 209 |
-
- **Example**: Top-left corner is (0, 0)
|
| 210 |
-
- **Example**: Bottom-right corner is (1000, 1000)
|
| 211 |
-
|
| 212 |
-
When you see an element on the screenshot:
|
| 213 |
-
1. Estimate its position relative to the screen dimensions
|
| 214 |
-
2. Convert to normalized coordinates between 0 and 1000
|
| 215 |
-
3. Use these normalized coordinates in your tool calls
|
| 216 |
-
|
| 217 |
-
**Never use pixel coordinates directly - always use normalized coordinates between 0 and 1000**
|
| 218 |
-
</coordinate_system>
|
| 219 |
-
|
| 220 |
-
<web_form_guidelines>
|
| 221 |
-
Look at the elements on the screen (input fields, checkboxes, buttons, dropdowns) to decide where to interact.
|
| 222 |
-
**Always use normalized coordinates (0 to 1000) based on the element's relative position on the screen.**
|
| 223 |
-
|
| 224 |
-
### Typical Web Form Interactions
|
| 225 |
-
- **Input fields**: click in the field first to focus it, then use `write("text")`.
|
| 226 |
-
- **Passwords**: type them just like text — `write("password123")`.
|
| 227 |
-
- **Checkboxes / radio buttons**: use `click(x,y)` directly on the box/circle. Click on the box/circle itself at the left side of the text, not on the text label.
|
| 228 |
-
- **Dropdown menus**: click to open, then click the option or use arrow keys + press("enter").
|
| 229 |
-
- **Submit buttons**: identify clearly labelled "Sign up", "Sign in", "Submit" buttons and click at their normalized coordinates.
|
| 230 |
-
- **Captcha or 2FA**: wait or prompt for external handling (do not bypass security).
|
| 231 |
-
- **Pop-ups**: try `press("enter")` to confirm or `press("escape")` to close if they block your action.
|
| 232 |
-
|
| 233 |
-
### Grouping Multiple Inputs
|
| 234 |
-
- If multiple fields, checkboxes, or similar controls are clearly visible **and** can be filled/clicked in sequence without ambiguity, you may include **several actions in one code block**.
|
| 235 |
-
- Keep each Action still in a **Python code block** but with multiple tool calls inside, for example:
|
| 236 |
-
```python
|
| 237 |
-
click(470, 300) # Email field (normalized coordinates)
|
| 238 |
-
write("[email protected]")
|
| 239 |
-
click(470, 350) # Password field (normalized coordinates)
|
| 240 |
-
write("mypassword123")
|
| 241 |
-
click(450, 550) # Checkbox left side of the text "Accept terms" (normalized coordinates)
|
| 242 |
-
```<end_code>
|
| 243 |
-
|
| 244 |
-
- Only group actions when:
|
| 245 |
-
1. They're all part of the **same form or step**,
|
| 246 |
-
2. The screenshot clearly shows all elements and coordinates,
|
| 247 |
-
3. The order of operations is obvious.
|
| 248 |
-
- Otherwise, default back to one Action per step.
|
| 249 |
-
|
| 250 |
-
### Precision
|
| 251 |
-
- Always **click before typing** to ensure the right field is active.
|
| 252 |
-
- Always **scroll if needed** to bring elements into view before clicking.
|
| 253 |
-
- Always **validate each action** via the screenshot before continuing.
|
| 254 |
-
- Always use **normalized coordinates between 0 and 1000**.
|
| 255 |
-
</web_form_guidelines>
|
| 256 |
-
|
| 257 |
-
<task_resolution_example>
|
| 258 |
-
For a task like "Sign up for an account and submit the form":
|
| 259 |
-
|
| 260 |
-
Step 1:
|
| 261 |
-
Short term goal: I want to open the signup page.
|
| 262 |
-
What I see: The browser is open on the homepage.
|
| 263 |
-
Reflection: I will open the signup URL directly.
|
| 264 |
-
Action:
|
| 265 |
-
```python
|
| 266 |
-
open("https://example.com/signup")
|
| 267 |
-
wait(3)
|
| 268 |
-
```<end_code>
|
| 269 |
-
|
| 270 |
-
Step 2:
|
| 271 |
-
Short term goal: I want to fill the form fields that are currently visible.
|
| 272 |
-
What I see: I see the signup form with "Email" and "Password" fields, plus a checkbox for accepting terms.
|
| 273 |
-
Reflection: I will fill all the visible form fields in sequence - click the email field and type the email, then click the password field and type the password, then click the checkbox to accept terms.
|
| 274 |
-
Action:
|
| 275 |
-
```python
|
| 276 |
-
click(470, 300) # Email field (normalized coordinates)
|
| 277 |
-
write("[email protected]")
|
| 278 |
-
click(470, 350) # Password field (normalized coordinates)
|
| 279 |
-
write("mypassword123")
|
| 280 |
-
click(450, 550) # Checkbox left side of the text "Accept terms" (normalized coordinates)
|
| 281 |
-
```<end_code>
|
| 282 |
-
|
| 283 |
-
Step 3:
|
| 284 |
-
Short term goal: I need to scroll down to see the "Sign Up" button.
|
| 285 |
-
What I see: The form fields are filled, but I cannot see the "Sign Up" button - it's likely below the current view.
|
| 286 |
-
Reflection: I will scroll down to bring the submit button into view so I can click it in the next step.
|
| 287 |
-
Action:
|
| 288 |
-
```python
|
| 289 |
-
scroll(500, 500, "down", 3)
|
| 290 |
-
```<end_code>
|
| 291 |
-
|
| 292 |
-
Step 4:
|
| 293 |
-
Short term goal: I want to submit the form.
|
| 294 |
-
What I see: The "Sign Up" button is at the bottom center, around 520, 650 in normalized coordinates.
|
| 295 |
-
Reflection: I will click the button to submit.
|
| 296 |
-
Action:
|
| 297 |
-
```python
|
| 298 |
-
click(520, 650)
|
| 299 |
-
wait(3)
|
| 300 |
-
```<end_code>
|
| 301 |
-
|
| 302 |
-
Step 5:
|
| 303 |
-
Short term goal: Verify signup completed.
|
| 304 |
-
What I see: A confirmation page "Welcome [email protected]".
|
| 305 |
-
Reflection: Task complete.
|
| 306 |
-
Action:
|
| 307 |
-
```python
|
| 308 |
-
final_answer("Signup completed")
|
| 309 |
-
```<end_code>
|
| 310 |
-
</task_resolution_example>
|
| 311 |
-
|
| 312 |
-
<general_guidelines>
|
| 313 |
-
# GUI Agent Guidelines for Web Forms (0-1000 Coordinates)
|
| 314 |
-
|
| 315 |
-
## Environment Overview
|
| 316 |
-
Ubuntu 22.04 XFCE4 desktop with Google Chrome/Chromium browser.
|
| 317 |
-
Agent can fill forms, sign up, sign in, click checkboxes, submit claims.
|
| 318 |
-
**All coordinates are normalized between 0 and 1000.**
|
| 319 |
-
|
| 320 |
-
## Core Principles
|
| 321 |
-
|
| 322 |
-
### 1. Screenshot Analysis
|
| 323 |
-
- Always analyze the latest screenshot carefully before each action.
|
| 324 |
-
- Validate that previous actions worked by examining the current state.
|
| 325 |
-
- If an action didn't work, try an alternative rather than repeating blindly.
|
| 326 |
-
|
| 327 |
-
### 2. Action Execution
|
| 328 |
-
- Execute one or multiple actions at a time (grouped in one code block).
|
| 329 |
-
- Wait for appropriate loading times using `wait()` but not indefinitely.
|
| 330 |
-
- Scroll to bring hidden elements into view.
|
| 331 |
-
|
| 332 |
-
### 3. Coordinate System
|
| 333 |
-
- **CRITICAL**: Always use normalized coordinates (0 to 1000)
|
| 334 |
-
- Convert visual position on screen to normalized coordinates
|
| 335 |
-
- Center of screen = (500, 500)
|
| 336 |
-
- Top-left = (0, 0), Bottom-right = (1000, 1000)
|
| 337 |
-
|
| 338 |
-
### 4. Keyboard Shortcuts
|
| 339 |
-
- Use `tab` to move between fields, `space` to toggle checkboxes, `enter` to submit forms.
|
| 340 |
-
- Copy/paste: `ctrl+C`, `ctrl+V`.
|
| 341 |
-
- Refresh page: `refresh()`.
|
| 342 |
-
|
| 343 |
-
### 5. Error Recovery
|
| 344 |
-
- If clicking doesn't work, try double_click or right_click.
|
| 345 |
-
- If typing doesn't appear, ensure the field is focused with click.
|
| 346 |
-
- If popups block the screen, try `press("enter")` or `press("escape")`.
|
| 347 |
-
|
| 348 |
-
### 6. Security & Privacy
|
| 349 |
-
- Don't attempt to bypass captchas or 2FA automatically.
|
| 350 |
-
- Don't store credentials in plain text unless instructed.
|
| 351 |
-
|
| 352 |
-
### 7. Final Answer
|
| 353 |
-
- When the form is successfully submitted or the goal achieved, use:
|
| 354 |
-
```python
|
| 355 |
-
final_answer("Done")
|
| 356 |
-
```<end_code>
|
| 357 |
-
</general_guidelines>
|
| 358 |
-
"""
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
class NormalizedCoordinatesSystemPrompt(Enum):
|
| 362 |
-
"""Normalized coordinates system prompt"""
|
| 363 |
-
|
| 364 |
-
FORM_SYSTEM_PROMPT = """You are a web form automation assistant that can control a remote desktop environment with a web browser open.
|
| 365 |
-
The current date is <<current_date>>.
|
| 366 |
-
|
| 367 |
-
<action_process>
|
| 368 |
-
You will be given a task to complete in several steps (e.g. filling forms, signing up, logging in, submitting claims).
|
| 369 |
-
At each step you will perform **one action**.
|
| 370 |
-
After each action, you will receive an updated screenshot.
|
| 371 |
-
Then you will proceed as follows, with these sections — do not skip any:
|
| 372 |
-
|
| 373 |
-
Short term goal: ...
|
| 374 |
-
What I see: ...
|
| 375 |
-
Reflection: ...
|
| 376 |
-
Action:
|
| 377 |
-
```python
|
| 378 |
-
tool_name(arguments)
|
| 379 |
-
```<end_code>
|
| 380 |
-
|
| 381 |
-
Always format your Action section as **Python code blocks** exactly as shown above.
|
| 382 |
-
</action_process>
|
| 383 |
-
|
| 384 |
-
<tools>
|
| 385 |
-
On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
|
| 386 |
-
{%- for tool in tools.values() %}
|
| 387 |
-
- {{ tool.name }}: {{ tool.description }}
|
| 388 |
-
Takes inputs: {{tool.inputs}}
|
| 389 |
-
Returns an output of type: {{tool.output_type}}
|
| 390 |
-
{%- endfor %}
|
| 391 |
-
</tools>
|
| 392 |
-
|
| 393 |
-
<coordinate_system>
|
| 394 |
-
**IMPORTANT: This system uses NORMALIZED COORDINATES (0.0 to 1.0)**
|
| 395 |
-
|
| 396 |
-
You must use normalized coordinates:
|
| 397 |
-
- **x-coordinate**: 0.0 = left edge, 1.0 = right edge of screen
|
| 398 |
-
- **y-coordinate**: 0.0 = top edge, 1.0 = bottom edge of screen
|
| 399 |
-
- **Example**: Center of screen is (0.5, 0.5)
|
| 400 |
-
- **Example**: Top-left corner is (0.0, 0.0)
|
| 401 |
-
- **Example**: Bottom-right corner is (1.0, 1.0)
|
| 402 |
-
|
| 403 |
-
When you see an element on the screenshot:
|
| 404 |
-
1. Estimate its position relative to the screen dimensions
|
| 405 |
-
2. Convert to normalized coordinates between 0.0 and 1.0
|
| 406 |
-
3. Use these normalized coordinates in your tool calls
|
| 407 |
-
|
| 408 |
-
**Never use pixel coordinates directly - always use normalized coordinates between 0.0 and 1.0**
|
| 409 |
-
</coordinate_system>
|
| 410 |
-
|
| 411 |
-
<web_form_guidelines>
|
| 412 |
-
Look at the elements on the screen (input fields, checkboxes, buttons, dropdowns) to decide where to interact.
|
| 413 |
-
**Always use normalized coordinates (0.0 to 1.0) based on the element's relative position on the screen.**
|
| 414 |
-
|
| 415 |
-
### Typical Web Form Interactions
|
| 416 |
-
- **Input fields**: click in the field first to focus it, then use `write("text")`.
|
| 417 |
-
- **Passwords**: type them just like text — `write("password123")`.
|
| 418 |
-
- **Checkboxes / radio buttons**: use `click(x,y)` directly on the box/circle.
|
| 419 |
-
- **Dropdown menus**: click to open, then click the option or use arrow keys + press("enter").
|
| 420 |
-
- **Submit buttons**: identify clearly labelled "Sign up", "Sign in", "Submit" buttons and click at their normalized coordinates.
|
| 421 |
-
- **Captcha or 2FA**: wait or prompt for external handling (do not bypass security).
|
| 422 |
-
- **Pop-ups**: try `press("enter")` to confirm or `press("escape")` to close if they block your action.
|
| 423 |
-
|
| 424 |
-
### Grouping Multiple Inputs
|
| 425 |
-
- If multiple fields, checkboxes, or similar controls are clearly visible **and** can be filled/clicked in sequence without ambiguity, you may include **several actions in one code block**.
|
| 426 |
-
- Keep each Action still in a **Python code block** but with multiple tool calls inside, for example:
|
| 427 |
-
```python
|
| 428 |
-
click(0.47, 0.30) # Email field (normalized coordinates)
|
| 429 |
-
wait(0.1)
|
| 430 |
-
write("[email protected]")
|
| 431 |
-
click(0.47, 0.35) # Password field (normalized coordinates)
|
| 432 |
-
wait(0.1)
|
| 433 |
-
write("mypassword123")
|
| 434 |
-
click(0.45, 0.55) # Checkbox "Accept terms" (normalized coordinates)
|
| 435 |
-
wait(0.1)
|
| 436 |
-
```<end_code>
|
| 437 |
-
- Only group actions when:
|
| 438 |
-
1. They're all part of the **same form or step**,
|
| 439 |
-
2. The screenshot clearly shows all elements and coordinates,
|
| 440 |
-
3. The order of operations is obvious.
|
| 441 |
-
- Otherwise, default back to one Action per step.
|
| 442 |
-
|
| 443 |
-
### Precision
|
| 444 |
-
- Always **click before typing** to ensure the right field is active.
|
| 445 |
-
- Always **scroll if needed** to bring elements into view before clicking.
|
| 446 |
-
- Always **validate each action** via the screenshot before continuing.
|
| 447 |
-
- Always use **normalized coordinates between 0.0 and 1.0**.
|
| 448 |
-
</web_form_guidelines>
|
| 449 |
-
|
| 450 |
-
<task_resolution_example>
|
| 451 |
-
For a task like "Sign up for an account and submit the form":
|
| 452 |
-
|
| 453 |
-
Step 1:
|
| 454 |
-
Short term goal: I want to open the signup page.
|
| 455 |
-
What I see: The browser is open on the homepage.
|
| 456 |
-
Reflection: I will open the signup URL directly.
|
| 457 |
-
Action:
|
| 458 |
-
```python
|
| 459 |
-
open("https://example.com/signup")
|
| 460 |
-
wait(3)
|
| 461 |
-
```<end_code>
|
| 462 |
-
|
| 463 |
-
Step 2:
|
| 464 |
-
Short term goal: I want to fill the "Email" field.
|
| 465 |
-
What I see: I see the signup form with an "Email" field roughly in the center-left of the screen.
|
| 466 |
-
Reflection: I will click inside the field (approximately 0.47, 0.30 in normalized coordinates) then type my email.
|
| 467 |
-
Action:
|
| 468 |
-
```python
|
| 469 |
-
click(0.47, 0.30)
|
| 470 |
-
write("[email protected]")
|
| 471 |
-
```<end_code>
|
| 472 |
-
|
| 473 |
-
Step 3:
|
| 474 |
-
Short term goal: I want to check the "I accept terms" checkbox.
|
| 475 |
-
What I see: The checkbox is in the lower portion of the form, around 0.45, 0.55 in normalized coordinates.
|
| 476 |
-
Reflection: I will click it.
|
| 477 |
-
Action:
|
| 478 |
-
```python
|
| 479 |
-
click(0.45, 0.55)
|
| 480 |
-
```<end_code>
|
| 481 |
-
|
| 482 |
-
Step 4:
|
| 483 |
-
Short term goal: I want to submit the form.
|
| 484 |
-
What I see: The "Sign Up" button is at the bottom center, around 0.52, 0.65 in normalized coordinates.
|
| 485 |
-
Reflection: I will click the button to submit.
|
| 486 |
-
Action:
|
| 487 |
-
```python
|
| 488 |
-
click(0.52, 0.65)
|
| 489 |
-
wait(3)
|
| 490 |
-
```<end_code>
|
| 491 |
-
|
| 492 |
-
Step 5:
|
| 493 |
-
Short term goal: Verify signup completed.
|
| 494 |
-
What I see: A confirmation page "Welcome [email protected]".
|
| 495 |
-
Reflection: Task complete.
|
| 496 |
-
Action:
|
| 497 |
-
```python
|
| 498 |
-
final_answer("Signup completed")
|
| 499 |
-
```<end_code>
|
| 500 |
-
</task_resolution_example>
|
| 501 |
-
|
| 502 |
-
<general_guidelines>
|
| 503 |
-
# GUI Agent Guidelines for Web Forms (Normalized Coordinates)
|
| 504 |
-
|
| 505 |
-
## Environment Overview
|
| 506 |
-
Ubuntu 22.04 XFCE4 desktop with Google Chrome/Chromium browser.
|
| 507 |
-
Agent can fill forms, sign up, sign in, click checkboxes, submit claims.
|
| 508 |
-
**All coordinates are normalized between 0.0 and 1.0.**
|
| 509 |
-
|
| 510 |
-
## Core Principles
|
| 511 |
-
|
| 512 |
-
### 1. Screenshot Analysis
|
| 513 |
-
- Always analyze the latest screenshot carefully before each action.
|
| 514 |
-
- Validate that previous actions worked by examining the current state.
|
| 515 |
-
- If an action didn't work, try an alternative rather than repeating blindly.
|
| 516 |
-
|
| 517 |
-
### 2. Action Execution
|
| 518 |
-
- Execute one action at a time.
|
| 519 |
-
- Wait for appropriate loading times using `wait()` but not indefinitely.
|
| 520 |
-
- Scroll to bring hidden elements into view.
|
| 521 |
-
|
| 522 |
-
### 3. Coordinate System
|
| 523 |
-
- **CRITICAL**: Always use normalized coordinates (0.0 to 1.0)
|
| 524 |
-
- Convert visual position on screen to normalized coordinates
|
| 525 |
-
- Center of screen = (0.5, 0.5)
|
| 526 |
-
- Top-left = (0.0, 0.0), Bottom-right = (1.0, 1.0)
|
| 527 |
-
|
| 528 |
-
### 4. Keyboard Shortcuts
|
| 529 |
-
- Use `tab` to move between fields, `space` to toggle checkboxes, `enter` to submit forms.
|
| 530 |
-
- Copy/paste: `ctrl+C`, `ctrl+V`.
|
| 531 |
-
- Refresh page: `refresh()`.
|
| 532 |
-
|
| 533 |
-
### 5. Error Recovery
|
| 534 |
-
- If clicking doesn't work, try double_click or right_click.
|
| 535 |
-
- If typing doesn't appear, ensure the field is focused with click.
|
| 536 |
-
- If popups block the screen, try `press("enter")` or `press("escape")`.
|
| 537 |
-
|
| 538 |
-
### 6. Security & Privacy
|
| 539 |
-
- Don't attempt to bypass captchas or 2FA automatically.
|
| 540 |
-
- Don't store credentials in plain text unless instructed.
|
| 541 |
-
|
| 542 |
-
### 7. Final Answer
|
| 543 |
-
- When the form is successfully submitted or the goal achieved, use:
|
| 544 |
-
```python
|
| 545 |
-
final_answer("Done")
|
| 546 |
-
```<end_code>
|
| 547 |
-
</general_guidelines>
|
| 548 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cua2-core/src/cua2-core/services/models/anthropic.py
DELETED
|
@@ -1,10 +0,0 @@
|
|
| 1 |
-
from smolagents import LiteLLMModel
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
class AnthropicModel(LiteLLMModel):
    """Anthropic model"""

    # Tag identifying this model family.
    MODEL_TYPE = "anthropic"

    def __init__(self, model_id: str):
        """Forward ``model_id`` unchanged to the ``LiteLLMModel`` constructor."""
        super().__init__(model_id=model_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cua2-core/src/cua2-core/services/models/gemini.py
DELETED
|
File without changes
|
cua2-core/src/cua2-core/services/models/get_model.py
DELETED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
from smolagents import Model
|
| 2 |
-
|
| 3 |
-
from backend.models.models import AgentType
|
| 4 |
-
from backend.services.models.anthropic import AnthropicModel
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
def get_model(model_id: str) -> tuple[Model, AgentType]:
    """Build the model object for ``model_id`` together with its agent type.

    Args:
        model_id: Model identifier; only identifiers containing the
            substring ``"sonnet"`` are recognised.

    Returns:
        An ``AnthropicModel`` created from ``model_id`` paired with
        ``AgentType.PIXEL_COORDINATES``.

    Raises:
        ValueError: If ``model_id`` does not contain ``"sonnet"``.
    """
    # Guard clause: reject anything that is not a recognised identifier.
    if "sonnet" not in model_id:
        raise ValueError(f"Model {model_id} not found")

    return AnthropicModel(model_id=model_id), AgentType.PIXEL_COORDINATES
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cua2-core/src/cua2-core/services/models/qwen.py
DELETED
|
File without changes
|
cua2-core/src/{cua2-core → cua2_core}/__init__.py
RENAMED
|
File without changes
|
cua2-core/src/{cua2-core → cua2_core}/app.py
RENAMED
|
@@ -4,8 +4,8 @@ from dotenv import load_dotenv
|
|
| 4 |
from fastapi import FastAPI
|
| 5 |
from fastapi.middleware.cors import CORSMiddleware
|
| 6 |
|
| 7 |
-
from
|
| 8 |
-
from
|
| 9 |
|
| 10 |
# Load environment variables
|
| 11 |
load_dotenv()
|
|
|
|
| 4 |
from fastapi import FastAPI
|
| 5 |
from fastapi.middleware.cors import CORSMiddleware
|
| 6 |
|
| 7 |
+
from cua2_core.services.agent_service import AgentService
|
| 8 |
+
from cua2_core.websocket.websocket_manager import WebSocketManager
|
| 9 |
|
| 10 |
# Load environment variables
|
| 11 |
load_dotenv()
|
cua2-core/src/{cua2-core → cua2_core}/main.py
RENAMED
|
@@ -1,10 +1,9 @@
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
import uvicorn
|
| 4 |
-
|
| 5 |
-
from
|
| 6 |
-
from
|
| 7 |
-
from backend.routes.websocket import router as websocket_router
|
| 8 |
|
| 9 |
# Include routes
|
| 10 |
app.include_router(router, prefix="/api/v1")
|
|
@@ -14,7 +13,7 @@ app.include_router(websocket_router)
|
|
| 14 |
# Health check endpoint (without prefix)
|
| 15 |
@app.get("/health")
|
| 16 |
async def health():
|
| 17 |
-
return {"status": "healthy", "service": "
|
| 18 |
|
| 19 |
|
| 20 |
if __name__ == "__main__":
|
|
@@ -29,9 +28,10 @@ if __name__ == "__main__":
|
|
| 29 |
print(f"WebSocket endpoint: ws://{host}:{port}/ws")
|
| 30 |
|
| 31 |
uvicorn.run(
|
| 32 |
-
"
|
| 33 |
host=host,
|
| 34 |
port=port,
|
| 35 |
-
reload=debug,
|
|
|
|
| 36 |
log_level="info" if not debug else "debug",
|
| 37 |
)
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
import uvicorn
|
| 4 |
+
from cua2_core.app import app
|
| 5 |
+
from cua2_core.routes.routes import router
|
| 6 |
+
from cua2_core.routes.websocket import router as websocket_router
|
|
|
|
| 7 |
|
| 8 |
# Include routes
|
| 9 |
app.include_router(router, prefix="/api/v1")
|
|
|
|
| 13 |
# Health check endpoint (without prefix)
|
| 14 |
@app.get("/health")
async def health():
    # Static liveness payload; no state is inspected before answering.
    payload = {"status": "healthy", "service": "cua2-core"}
    return payload
|
| 17 |
|
| 18 |
|
| 19 |
if __name__ == "__main__":
|
|
|
|
| 28 |
print(f"WebSocket endpoint: ws://{host}:{port}/ws")
|
| 29 |
|
| 30 |
uvicorn.run(
|
| 31 |
+
"cua2_core.app:app",
|
| 32 |
host=host,
|
| 33 |
port=port,
|
| 34 |
+
# reload=debug,
|
| 35 |
+
reload=True,
|
| 36 |
log_level="info" if not debug else "debug",
|
| 37 |
)
|
cua2-core/src/cua2_core/models/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Models module for CUA2 Core"""
|
| 2 |
+
|
cua2-core/src/cua2_core/models/models.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from enum import Enum
|
| 5 |
+
from typing import Annotated, Literal, TypeAlias
|
| 6 |
+
|
| 7 |
+
from pydantic import BaseModel, Field, field_serializer, model_validator
|
| 8 |
+
|
| 9 |
+
#################### Backend -> Frontend ########################
|
| 10 |
+
|
| 11 |
+
class AgentAction(BaseModel):
    """Agent action structure"""

    # Name of the action; validation restricts it to the literals listed here.
    actionType: Literal["click", "write", "press", "scroll", "wait", "open", "launch_app", "refresh", "go_back"]
    # Arguments of the action keyed by name. Every key read by to_string()
    # is optional and has a placeholder/default there.
    actionArguments: dict

    def to_string(self) -> str:
        """Convert action to a human-readable string.

        Returns:
            A one-line description built from ``actionType`` and the matching
            entries of ``actionArguments``. Arguments that are absent are
            replaced by the per-action defaults used below. An action type
            that has no dedicated description yields
            ``"Unknown action: <type>"`` rather than an implicit ``None``.
        """
        action_type = self.actionType
        args = self.actionArguments

        if action_type == "click":
            x = args.get("x", "?")
            y = args.get("y", "?")
            return f"Click at coordinates ({x}, {y})"

        if action_type == "write":
            text = args.get("text", "")
            return f"Type text: '{text}'"

        if action_type == "press":
            key = args.get("key", "")
            return f"Press key: {key}"

        if action_type == "scroll":
            direction = args.get("direction", "down")
            amount = args.get("amount", 2)
            return f"Scroll {direction} by {amount}"

        if action_type == "wait":
            seconds = args.get("seconds", 0)
            return f"Wait for {seconds} seconds"

        if action_type == "open":
            file_or_url = args.get("file_or_url", "")
            return f"Open: {file_or_url}"

        if action_type == "launch_app":
            app_name = args.get("app_name", "")
            return f"Launch app: {app_name}"

        if action_type == "refresh":
            return "Refresh the current page"

        if action_type == "go_back":
            return "Go back one page"

        # Not reachable for validated instances, but an instance created
        # without validation, or a Literal member added later without a
        # branch above, would otherwise fall through and return None despite
        # the ``-> str`` annotation.
        return f"Unknown action: {action_type}"
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class AgentStep(BaseModel):
    """Agent step structure"""

    # Identifiers of the owning trace and of this step.
    traceId: str
    stepId: str
    image: str
    thought: str
    # Structured actions; how they are dumped is decided by serialize_actions().
    actions: list[AgentAction]
    timeTaken: float
    inputTokensUsed: int
    outputTokensUsed: int
    timestamp: datetime
    step_evaluation: Literal['like', 'dislike', 'neutral']

    @field_serializer('actions')
    def serialize_actions(self, actions: list[AgentAction], _info):
        """Serialize ``actions`` according to the serialization context.

        When the context is set and its ``actions_as_json`` entry is truthy,
        each action is dumped as a JSON-mode dict; in every other case each
        action is rendered through ``AgentAction.to_string()``.
        """
        context = _info.context
        wants_json = context.get('actions_as_json', False) if context else False

        if not wants_json:
            return [action.to_string() for action in actions]

        return [action.model_dump(mode="json") for action in actions]
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class AgentTraceMetadata(BaseModel):
    """Metadata for agent execution"""

    # Every field has a zero/empty default, so the model can be built with
    # no arguments.
    traceId: str = ""
    inputTokensUsed: int = 0
    outputTokensUsed: int = 0
    timeTaken: float = 0.0  # in seconds
    numberOfSteps: int = 0
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class AgentTrace(BaseModel):
    """Agent message structure"""

    id: str
    timestamp: datetime
    instruction: str
    modelId: str
    isRunning: bool
    # Optional parts of a trace: both default to empty values.
    steps: list[AgentStep] = []
    traceMetadata: AgentTraceMetadata = AgentTraceMetadata()

    @model_validator(mode="after")
    def validate_trace(self):
        """Replace falsy ``steps`` / ``traceMetadata`` with empty defaults.

        Runs after field validation and returns the same instance. The
        assignments stay conditional on purpose: a field is only written
        when its current value is falsy.
        """
        steps_missing = not self.steps
        if steps_missing:
            self.steps = []

        metadata_missing = not self.traceMetadata
        if metadata_missing:
            self.traceMetadata = AgentTraceMetadata()

        return self
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
#################### WebSocket Events ########################
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
class AgentStartEvent(BaseModel):
    """Agent start event"""

    # Fixed tag; the WebSocketEvent union discriminates on this field.
    type: Literal["agent_start"] = "agent_start"
    # Trace carried by the event.
    agentTrace: AgentTrace
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
class AgentProgressEvent(BaseModel):
    """WebSocket event carrying one agent step plus the trace counters."""

    # Fixed tag identifying this event type
    type: Literal["agent_progress"] = "agent_progress"
    # The step being reported
    agentStep: AgentStep
    # Trace-level counters sent along with the step
    traceMetadata: AgentTraceMetadata
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
class AgentCompleteEvent(BaseModel):
    """WebSocket event signalling that an agent run has completed."""

    # Fixed tag identifying this event type
    type: Literal["agent_complete"] = "agent_complete"
    # Trace-level counters sent with the completion
    traceMetadata: AgentTraceMetadata
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
class AgentErrorEvent(BaseModel):
    """WebSocket event reporting an error to the client."""

    # Fixed tag identifying this event type
    type: Literal["agent_error"] = "agent_error"
    # Human-readable error description
    error: str
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
class VncUrlSetEvent(BaseModel):
    """WebSocket event that hands a VNC URL to the client."""

    # Fixed tag identifying this event type
    type: Literal["vnc_url_set"] = "vnc_url_set"
    # The URL being set
    vncUrl: str
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
class VncUrlUnsetEvent(BaseModel):
    """WebSocket event telling the client the VNC URL is no longer set."""

    # Fixed tag identifying this event type; the event has no payload
    type: Literal["vnc_url_unset"] = "vnc_url_unset"
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
class HeartbeatEvent(BaseModel):
    """Payload-free WebSocket heartbeat event."""

    # Fixed tag identifying this event type
    type: Literal["heartbeat"] = "heartbeat"
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
# Union of every event model that can travel over the WebSocket; the
# "type" field of each member is the discriminator used to pick the model.
WebSocketEvent: TypeAlias = Annotated[
    AgentStartEvent
    | AgentProgressEvent
    | AgentCompleteEvent
    | AgentErrorEvent
    | VncUrlSetEvent
    | VncUrlUnsetEvent
    | HeartbeatEvent,
    Field(discriminator="type"),
]
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
#################### Frontend -> Backend ########################
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
class UserTaskMessage(BaseModel):
    """Task request sent by the frontend to the backend."""

    # NOTE(review): the websocket route in this commit reads the raw keys
    # "type" and "trace" from the incoming dict instead of validating with
    # this model -- confirm which field names the frontend actually sends.
    event_type: Literal["user_task"]
    # Trace describing the requested task; may be omitted
    agent_trace: AgentTrace | None = None
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
##################### Agent Service ########################
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
class ActiveTask(BaseModel):
    """A task currently being processed by the agent service.

    Building (validating) an instance has side effects: the trace directory
    is created if needed and a ``tasks.json`` snapshot of the task is
    written into it.
    """

    message_id: str
    instruction: str
    modelId: str
    # default_factory gives every task its own creation time; a plain
    # ``datetime.now()`` default is evaluated only once, when the class is
    # defined, and would be shared by all tasks created without a timestamp.
    timestamp: datetime = Field(default_factory=datetime.now)
    steps: list[AgentStep] = Field(default_factory=list)
    # A fresh metadata object per task: store_model() mutates it in place.
    traceMetadata: AgentTraceMetadata = Field(default_factory=AgentTraceMetadata)

    @property
    def trace_path(self):
        """Directory, relative to the working directory, for this task's files."""
        # NOTE(review): message_id and modelId are interpolated unsanitised;
        # a "/" or ".." in either one changes where the directory ends up.
        # Confirm that only trusted identifiers reach this model.
        return f"data/trace-{self.message_id}-{self.modelId}"

    @model_validator(mode="after")
    def store_model(self):
        """Stamp the trace id on the metadata and persist the task to disk.

        Returns:
            The validated instance itself.
        """
        self.traceMetadata.traceId = self.message_id
        os.makedirs(self.trace_path, exist_ok=True)
        with open(f"{self.trace_path}/tasks.json", "w", encoding="utf-8") as f:
            # The context flag makes AgentStep dump its actions as JSON
            # objects rather than as display strings.
            json.dump(
                self.model_dump(mode="json", context={"actions_as_json": True}),
                f,
                indent=2,
            )

        return self
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
class HealthResponse(BaseModel):
    """Body returned by the health-check endpoint."""

    # Status string reported by the service
    status: str
    # Timestamp included in the response
    timestamp: datetime
    # Number of WebSocket connections reported
    websocket_connections: int
|
cua2-core/src/cua2_core/routes/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Routes module for CUA2 Core"""
|
| 2 |
+
|
cua2-core/src/{cua2-core → cua2_core}/routes/routes.py
RENAMED
|
@@ -3,9 +3,9 @@ from datetime import datetime
|
|
| 3 |
from fastapi import APIRouter, Depends, HTTPException, Request
|
| 4 |
|
| 5 |
# Get services from app state
|
| 6 |
-
from
|
| 7 |
-
from
|
| 8 |
-
from
|
| 9 |
|
| 10 |
# Create router
|
| 11 |
router = APIRouter()
|
|
|
|
| 3 |
from fastapi import APIRouter, Depends, HTTPException, Request
|
| 4 |
|
| 5 |
# Get services from app state
|
| 6 |
+
from cua2_core.models.models import HealthResponse
|
| 7 |
+
from cua2_core.services.agent_service import AgentService
|
| 8 |
+
from cua2_core.websocket.websocket_manager import WebSocketManager
|
| 9 |
|
| 10 |
# Create router
|
| 11 |
router = APIRouter()
|
cua2-core/src/{cua2-core → cua2_core}/routes/websocket.py
RENAMED
|
@@ -3,8 +3,8 @@ import json
|
|
| 3 |
from fastapi import APIRouter, WebSocket, WebSocketDisconnect
|
| 4 |
|
| 5 |
# Get services from app state
|
| 6 |
-
from
|
| 7 |
-
from
|
| 8 |
|
| 9 |
# Create router
|
| 10 |
router = APIRouter()
|
|
@@ -20,11 +20,8 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
| 20 |
await websocket_manager.connect(websocket)
|
| 21 |
|
| 22 |
try:
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
content="WebSocket connection established successfully",
|
| 26 |
-
messageId="connection_welcome",
|
| 27 |
-
)
|
| 28 |
await websocket_manager.send_personal_message(welcome_message, websocket)
|
| 29 |
|
| 30 |
# Keep the connection alive and wait for messages
|
|
@@ -36,27 +33,32 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
| 36 |
try:
|
| 37 |
# Parse the message
|
| 38 |
message_data = json.loads(data)
|
| 39 |
-
message
|
| 40 |
-
|
| 41 |
-
#
|
| 42 |
-
if
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
)
|
| 61 |
await websocket_manager.send_personal_message(
|
| 62 |
error_response, websocket
|
|
@@ -64,9 +66,12 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
| 64 |
|
| 65 |
except Exception as e:
|
| 66 |
print(f"Error processing message: {e}")
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
| 68 |
type="agent_error",
|
| 69 |
-
|
| 70 |
)
|
| 71 |
await websocket_manager.send_personal_message(
|
| 72 |
error_response, websocket
|
|
|
|
| 3 |
from fastapi import APIRouter, WebSocket, WebSocketDisconnect
|
| 4 |
|
| 5 |
# Get services from app state
|
| 6 |
+
from cua2_core.app import app
|
| 7 |
+
from cua2_core.models.models import UserTaskMessage, AgentTrace, HeartbeatEvent
|
| 8 |
|
| 9 |
# Create router
|
| 10 |
router = APIRouter()
|
|
|
|
| 20 |
await websocket_manager.connect(websocket)
|
| 21 |
|
| 22 |
try:
|
| 23 |
+
# Send welcome heartbeat
|
| 24 |
+
welcome_message = HeartbeatEvent(type="heartbeat")
|
|
|
|
|
|
|
|
|
|
| 25 |
await websocket_manager.send_personal_message(welcome_message, websocket)
|
| 26 |
|
| 27 |
# Keep the connection alive and wait for messages
|
|
|
|
| 33 |
try:
|
| 34 |
# Parse the message
|
| 35 |
message_data = json.loads(data)
|
| 36 |
+
print(f"Received message: {message_data}")
|
| 37 |
+
|
| 38 |
+
# Check if it's a user task message
|
| 39 |
+
if message_data.get("type") == "user_task":
|
| 40 |
+
# Extract and parse the trace
|
| 41 |
+
trace_data = message_data.get("trace")
|
| 42 |
+
if trace_data:
|
| 43 |
+
# Convert timestamp string to datetime if needed
|
| 44 |
+
if isinstance(trace_data.get("timestamp"), str):
|
| 45 |
+
from datetime import datetime
|
| 46 |
+
trace_data["timestamp"] = datetime.fromisoformat(trace_data["timestamp"].replace("Z", "+00:00"))
|
| 47 |
+
|
| 48 |
+
trace = AgentTrace(**trace_data)
|
| 49 |
+
|
| 50 |
+
# Process the user task with the trace
|
| 51 |
+
trace_id = await agent_service.process_user_task(trace)
|
| 52 |
+
print(f"Started processing trace: {trace_id}")
|
| 53 |
+
else:
|
| 54 |
+
print("No trace data in message")
|
| 55 |
+
|
| 56 |
+
except json.JSONDecodeError as e:
|
| 57 |
+
print(f"JSON decode error: {e}")
|
| 58 |
+
from cua2_core.models.models import AgentErrorEvent
|
| 59 |
+
error_response = AgentErrorEvent(
|
| 60 |
+
type="agent_error",
|
| 61 |
+
error="Invalid JSON format"
|
| 62 |
)
|
| 63 |
await websocket_manager.send_personal_message(
|
| 64 |
error_response, websocket
|
|
|
|
| 66 |
|
| 67 |
except Exception as e:
|
| 68 |
print(f"Error processing message: {e}")
|
| 69 |
+
import traceback
|
| 70 |
+
traceback.print_exc()
|
| 71 |
+
from cua2_core.models.models import AgentErrorEvent
|
| 72 |
+
error_response = AgentErrorEvent(
|
| 73 |
type="agent_error",
|
| 74 |
+
error=f"Error processing message: {str(e)}"
|
| 75 |
)
|
| 76 |
await websocket_manager.send_personal_message(
|
| 77 |
error_response, websocket
|
cua2-core/src/cua2_core/services/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Services module for CUA2 Core"""
|
| 2 |
+
|
cua2-core/src/cua2_core/services/agent_service.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import json
|
| 3 |
+
import base64
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
from cua2_core.models.models import (
|
| 9 |
+
ActiveTask,
|
| 10 |
+
AgentTrace,
|
| 11 |
+
AgentStep,
|
| 12 |
+
AgentAction,
|
| 13 |
+
AgentTraceMetadata,
|
| 14 |
+
AgentStartEvent,
|
| 15 |
+
AgentProgressEvent,
|
| 16 |
+
AgentCompleteEvent,
|
| 17 |
+
AgentErrorEvent,
|
| 18 |
+
VncUrlSetEvent,
|
| 19 |
+
VncUrlUnsetEvent,
|
| 20 |
+
)
|
| 21 |
+
from cua2_core.websocket.websocket_manager import WebSocketManager
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class AgentService:
    """Service for handling agent tasks and processing.

    Mock implementation: rather than driving a real agent it replays the
    steps stored in ``simulation_metadata/simulated_trace.json`` and
    broadcasts the resulting events through the WebSocket manager.
    """

    def __init__(self, websocket_manager):
        """Create the task registry and locate the bundled simulation data.

        Args:
            websocket_manager: Manager used to broadcast events to clients.
        """
        self.active_tasks: dict[str, ActiveTask] = {}
        self.websocket_manager: WebSocketManager = websocket_manager
        simulation_dir = Path(__file__).parent / "simulation_metadata"
        self.simulation_data_path = simulation_dir / "simulated_trace.json"
        self.simulation_images_path = simulation_dir / "images"
        # The event loop keeps only weak references to tasks, so running
        # simulations are held here until they finish; otherwise a task may
        # be garbage-collected before it completes.
        self._background_tasks: set[asyncio.Task] = set()

    async def process_user_task(self, trace: AgentTrace) -> str:
        """Register a user task, start its simulation and return the trace ID.

        The trace's steps and metadata are reset before processing starts.

        Args:
            trace: Trace describing the task to run.

        Returns:
            The ID of the trace that is now being processed.
        """
        trace_id = trace.id
        trace.steps = []
        trace.traceMetadata = AgentTraceMetadata(traceId=trace_id)

        # Store the task
        self.active_tasks[trace_id] = ActiveTask(
            message_id=trace_id,
            instruction=trace.instruction,
            modelId=trace.modelId,
            timestamp=trace.timestamp,
            steps=trace.steps,
            traceMetadata=trace.traceMetadata,
        )

        # Start the agent processing in the background, keeping a strong
        # reference to the task for as long as it runs.
        task = asyncio.create_task(self._simulate_agent_processing(trace))
        self._background_tasks.add(task)
        task.add_done_callback(self._background_tasks.discard)

        return trace_id

    def _load_image_data_url(self, image_ref: str) -> str:
        """Return the simulation image named by *image_ref* as a PNG data URL."""
        # Only the last path component is used to find the file in the
        # images directory.
        image_path = self.simulation_images_path / image_ref.split("/")[-1]
        with open(image_path, "rb") as img_file:
            encoded = base64.b64encode(img_file.read()).decode("utf-8")
        return f"data:image/png;base64,{encoded}"

    def _build_step(self, trace_id: str, step_data: dict) -> AgentStep:
        """Turn one entry of the simulation file into an ``AgentStep``."""
        image_base64 = self._load_image_data_url(step_data["image"])
        actions = [
            AgentAction(
                actionType=action["actionType"],
                actionArguments=action["actionArguments"],
            )
            for action in step_data["actions"]
        ]
        return AgentStep(
            traceId=trace_id,
            stepId=step_data["stepId"],
            image=image_base64,
            thought=step_data["thought"],
            actions=actions,
            timeTaken=step_data["timeTaken"],
            inputTokensUsed=step_data["inputTokensUsed"],
            outputTokensUsed=step_data["outputTokensUsed"],
            # Rewrite a trailing "Z" as an explicit UTC offset before parsing.
            timestamp=datetime.fromisoformat(
                step_data["timestamp"].replace("Z", "+00:00")
            ),
            step_evaluation=step_data["step_evaluation"],
        )

    async def _simulate_agent_processing(self, trace: AgentTrace):
        """Simulate agent processing using simulated_trace.json data.

        Broadcasts, in order: a start event, a VNC-URL-set event, one
        progress event per simulated step (after sleeping for that step's
        ``timeTaken``), a VNC-URL-unset event and a completion event. Any
        exception is reported to clients as an error event. The task is
        removed from ``active_tasks`` in both cases.
        """
        trace_id = trace.id
        broadcast = self.websocket_manager.broadcast

        try:
            # Load simulation data
            with open(self.simulation_data_path, "r", encoding="utf-8") as f:
                simulation_data = json.load(f)

            # Send agent start event with the initial trace
            await broadcast(AgentStartEvent(type="agent_start", agentTrace=trace))

            # mock VNC URL
            vnc_url = "https://www.youtube.com/embed/VCutEsRSJ5A?si=PT0ETJ7zIJ9ywhGW"
            await broadcast(VncUrlSetEvent(type="vnc_url_set", vncUrl=vnc_url))

            trace_metadata = AgentTraceMetadata(traceId=trace_id)

            for step_data in simulation_data["steps"]:
                # Wait before sending the step to simulate processing time
                await asyncio.sleep(step_data["timeTaken"])

                agent_step = self._build_step(trace_id, step_data)

                trace_metadata.numberOfSteps += 1
                trace_metadata.timeTaken += step_data["timeTaken"]
                trace_metadata.inputTokensUsed += step_data["inputTokensUsed"]
                trace_metadata.outputTokensUsed += step_data["outputTokensUsed"]

                await broadcast(
                    AgentProgressEvent(
                        type="agent_progress",
                        agentStep=agent_step,
                        traceMetadata=trace_metadata,
                    )
                )

                # Update active task
                self.active_tasks[trace_id].steps.append(agent_step)

            # Unset VNC URL before completion
            await broadcast(VncUrlUnsetEvent(type="vnc_url_unset"))

            await broadcast(
                AgentCompleteEvent(
                    type="agent_complete",
                    traceMetadata=trace_metadata,
                )
            )

            # Update active task with final metadata
            self.active_tasks[trace_id].traceMetadata = trace_metadata

            # Clean up after a delay
            await asyncio.sleep(1)
            self.active_tasks.pop(trace_id, None)

        except Exception as e:
            print(f"Error in agent simulation: {str(e)}")
            try:
                await broadcast(
                    AgentErrorEvent(
                        type="agent_error",
                        error=f"Error processing task: {str(e)}",
                    )
                )
            finally:
                # Drop the task even if reporting the error fails as well.
                self.active_tasks.pop(trace_id, None)

    def get_active_tasks(self) -> dict[str, ActiveTask]:
        """Return a shallow copy of the trace-ID -> task registry."""
        return self.active_tasks.copy()

    def get_task_status(self, message_id: str) -> Optional[ActiveTask]:
        """Return the active task for *message_id*, or ``None`` if there is none."""
        return self.active_tasks.get(message_id)
|
cua2-core/src/cua2_core/services/simulation_metadata/simulated_trace.json
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"steps": [
|
| 3 |
+
{
|
| 4 |
+
"stepId": "step_001",
|
| 5 |
+
"image": "images/step_1.png",
|
| 6 |
+
"thought": "I can see a form with multiple input fields. I need to start by clicking on the first name field to begin filling out the form.",
|
| 7 |
+
"actions": [
|
| 8 |
+
{
|
| 9 |
+
"actionType": "click",
|
| 10 |
+
"actionArguments": {
|
| 11 |
+
"x": 320,
|
| 12 |
+
"y": 180
|
| 13 |
+
}
|
| 14 |
+
}
|
| 15 |
+
],
|
| 16 |
+
"timeTaken": 2.3,
|
| 17 |
+
"inputTokensUsed": 1250,
|
| 18 |
+
"outputTokensUsed": 85,
|
| 19 |
+
"timestamp": "2025-10-17T14:30:02.300Z",
|
| 20 |
+
"step_evaluation": "like"
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"stepId": "step_002",
|
| 24 |
+
"image": "images/step_2.png",
|
| 25 |
+
"thought": "After clicking the first field, I can see the cursor is active. Now I should proceed to click on the email field to continue with the form submission process.",
|
| 26 |
+
"actions": [
|
| 27 |
+
{
|
| 28 |
+
"actionType": "click",
|
| 29 |
+
"actionArguments": {
|
| 30 |
+
"x": 420,
|
| 31 |
+
"y": 285
|
| 32 |
+
}
|
| 33 |
+
}
|
| 34 |
+
],
|
| 35 |
+
"timeTaken": 1.8,
|
| 36 |
+
"inputTokensUsed": 1180,
|
| 37 |
+
"outputTokensUsed": 72,
|
| 38 |
+
"timestamp": "2025-10-17T14:30:04.100Z",
|
| 39 |
+
"step_evaluation": "like"
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"stepId": "step_003",
|
| 43 |
+
"image": "images/step_3.png",
|
| 44 |
+
"thought": "The form appears to be mostly filled. I can see a submit button at the bottom of the form. I'll click on it to complete the form submission.",
|
| 45 |
+
"actions": [
|
| 46 |
+
{
|
| 47 |
+
"actionType": "click",
|
| 48 |
+
"actionArguments": {
|
| 49 |
+
"x": 450,
|
| 50 |
+
"y": 520
|
| 51 |
+
}
|
| 52 |
+
}
|
| 53 |
+
],
|
| 54 |
+
"timeTaken": 1.5,
|
| 55 |
+
"inputTokensUsed": 1100,
|
| 56 |
+
"outputTokensUsed": 68,
|
| 57 |
+
"timestamp": "2025-10-17T14:30:05.600Z",
|
| 58 |
+
"step_evaluation": "like"
|
| 59 |
+
}
|
| 60 |
+
]
|
| 61 |
+
}
|
| 62 |
+
|
cua2-core/src/cua2_core/websocket/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""WebSocket module for CUA2 Core"""
|
| 2 |
+
|
cua2-core/src/{cua2-core → cua2_core}/websocket/websocket_manager.py
RENAMED
|
@@ -4,7 +4,7 @@ from typing import Dict, Optional, Set
|
|
| 4 |
|
| 5 |
from fastapi import WebSocket
|
| 6 |
|
| 7 |
-
from
|
| 8 |
|
| 9 |
|
| 10 |
class WebSocketManager:
|
|
@@ -35,7 +35,7 @@ class WebSocketManager:
|
|
| 35 |
):
|
| 36 |
"""Send a message to a specific WebSocket connection"""
|
| 37 |
try:
|
| 38 |
-
await websocket.send_text(json.dumps(message.model_dump()))
|
| 39 |
except Exception as e:
|
| 40 |
print(f"Error sending personal message: {e}")
|
| 41 |
# Only disconnect if the connection is still in our set
|
|
@@ -52,7 +52,7 @@ class WebSocketManager:
|
|
| 52 |
|
| 53 |
for connection in self.active_connections.copy():
|
| 54 |
try:
|
| 55 |
-
await connection.send_text(json.dumps(message.model_dump()))
|
| 56 |
except Exception as e:
|
| 57 |
print(f"Error broadcasting to connection: {e}")
|
| 58 |
disconnected.append(connection)
|
|
@@ -77,7 +77,7 @@ class WebSocketManager:
|
|
| 77 |
await self.broadcast(event)
|
| 78 |
|
| 79 |
async def send_agent_complete(
|
| 80 |
-
self, content: str, message_id: str, metadata: Optional[
|
| 81 |
):
|
| 82 |
"""Send agent complete event"""
|
| 83 |
event = WebSocketEvent(
|
|
|
|
| 4 |
|
| 5 |
from fastapi import WebSocket
|
| 6 |
|
| 7 |
+
from cua2_core.models.models import AgentTraceMetadata, WebSocketEvent
|
| 8 |
|
| 9 |
|
| 10 |
class WebSocketManager:
|
|
|
|
| 35 |
):
|
| 36 |
"""Send a message to a specific WebSocket connection"""
|
| 37 |
try:
|
| 38 |
+
await websocket.send_text(json.dumps(message.model_dump(mode="json")))
|
| 39 |
except Exception as e:
|
| 40 |
print(f"Error sending personal message: {e}")
|
| 41 |
# Only disconnect if the connection is still in our set
|
|
|
|
| 52 |
|
| 53 |
for connection in self.active_connections.copy():
|
| 54 |
try:
|
| 55 |
+
await connection.send_text(json.dumps(message.model_dump(mode="json")))
|
| 56 |
except Exception as e:
|
| 57 |
print(f"Error broadcasting to connection: {e}")
|
| 58 |
disconnected.append(connection)
|
|
|
|
| 77 |
await self.broadcast(event)
|
| 78 |
|
| 79 |
async def send_agent_complete(
|
| 80 |
+
self, content: str, message_id: str, metadata: Optional[AgentTraceMetadata] = None
|
| 81 |
):
|
| 82 |
"""Send agent complete event"""
|
| 83 |
event = WebSocketEvent(
|