Amir Mahla committed on
Commit
c9554cf
·
1 Parent(s): af1ae43

MOCK backend

Browse files
Files changed (25) hide show
  1. cua2-core/pyproject.toml +1 -3
  2. cua2-core/src/cua2-core/models/models.py +0 -95
  3. cua2-core/src/cua2-core/services/agent_service.py +0 -130
  4. cua2-core/src/cua2-core/services/agents/get_agents.py +0 -57
  5. cua2-core/src/cua2-core/services/agents/normalized_1000_agent.py +0 -293
  6. cua2-core/src/cua2-core/services/agents/normalized_agent.py +0 -282
  7. cua2-core/src/cua2-core/services/agents/pixel_coordonates_agent.py +0 -317
  8. cua2-core/src/cua2-core/services/agents/prompt.py +0 -548
  9. cua2-core/src/cua2-core/services/models/anthropic.py +0 -10
  10. cua2-core/src/cua2-core/services/models/gemini.py +0 -0
  11. cua2-core/src/cua2-core/services/models/get_model.py +0 -12
  12. cua2-core/src/cua2-core/services/models/qwen.py +0 -0
  13. cua2-core/src/{cua2-core → cua2_core}/__init__.py +0 -0
  14. cua2-core/src/{cua2-core → cua2_core}/app.py +2 -2
  15. cua2-core/src/{cua2-core → cua2_core}/main.py +7 -7
  16. cua2-core/src/cua2_core/models/__init__.py +2 -0
  17. cua2-core/src/cua2_core/models/models.py +221 -0
  18. cua2-core/src/cua2_core/routes/__init__.py +2 -0
  19. cua2-core/src/{cua2-core → cua2_core}/routes/routes.py +3 -3
  20. cua2-core/src/{cua2-core → cua2_core}/routes/websocket.py +35 -30
  21. cua2-core/src/cua2_core/services/__init__.py +2 -0
  22. cua2-core/src/cua2_core/services/agent_service.py +172 -0
  23. cua2-core/src/cua2_core/services/simulation_metadata/simulated_trace.json +62 -0
  24. cua2-core/src/cua2_core/websocket/__init__.py +2 -0
  25. cua2-core/src/{cua2-core → cua2_core}/websocket/websocket_manager.py +4 -4
cua2-core/pyproject.toml CHANGED
@@ -6,7 +6,6 @@ build-backend = "hatchling.build"
6
  name = "cua2-core"
7
  version = "0.0.0-dev.0"
8
  description = "Backend API server for Computer Use Agent"
9
- readme = "README.md"
10
  authors = [{ name = "Amir Mahla", email = "[email protected]" }]
11
  keywords = ["fastapi", "api", "backend", "automation"]
12
  classifiers = [
@@ -61,12 +60,11 @@ Homepage = "https://github.com/huggingface/CUA2"
61
  Repository = "https://github.com/huggingface/CUA2"
62
 
63
  [tool.hatch.build.targets.wheel]
64
- packages = ["src/cua2-core"]
65
 
66
  [tool.hatch.build.targets.sdist]
67
  include = [
68
  "/src",
69
- "/README.md",
70
  ]
71
 
72
  [tool.coverage.run]
 
6
  name = "cua2-core"
7
  version = "0.0.0-dev.0"
8
  description = "Backend API server for Computer Use Agent"
 
9
  authors = [{ name = "Amir Mahla", email = "[email protected]" }]
10
  keywords = ["fastapi", "api", "backend", "automation"]
11
  classifiers = [
 
60
  Repository = "https://github.com/huggingface/CUA2"
61
 
62
  [tool.hatch.build.targets.wheel]
63
+ packages = ["src/cua2_core"]
64
 
65
  [tool.hatch.build.targets.sdist]
66
  include = [
67
  "/src",
 
68
  ]
69
 
70
  [tool.coverage.run]
cua2-core/src/cua2-core/models/models.py DELETED
@@ -1,95 +0,0 @@
1
- import json
2
- import os
3
- from datetime import datetime
4
- from enum import Enum
5
- from typing import Literal, Optional
6
-
7
- from pydantic import BaseModel, model_validator
8
-
9
-
10
- class AgentMetadata(BaseModel):
11
- """Metadata for agent execution"""
12
-
13
- inputTokensUsed: int
14
- outputTokensUsed: int
15
- timeTaken: float # in seconds
16
- numberOfSteps: int
17
-
18
-
19
- class AgentType(str, Enum):
20
- """Agent type"""
21
-
22
- PIXEL_COORDINATES = "pixel_coordinates"
23
- NORMALIZED_1000_COORDINATES = "normalized_1000_coordinates"
24
- NORMALIZED_COORDINATES = "normalized_coordinates"
25
-
26
-
27
- class ActiveTask(BaseModel):
28
- """Active task"""
29
-
30
- message_id: str
31
- content: str
32
- model_id: str
33
- start_time: datetime
34
- status: str
35
-
36
- @property
37
- def trace_path(self):
38
- """Trace path"""
39
- return f"data/trace-{self.message_id}-{self.model_id}"
40
-
41
- @model_validator(mode="after")
42
- def validate_model_id(self):
43
- """Validate model ID"""
44
- os.makedirs(self.trace_path, exist_ok=True)
45
- with open(f"{self.trace_path}/user_tasks.json", "w") as f:
46
- json.dump(self.model_dump(mode="json"), f, indent=2)
47
-
48
- return self
49
-
50
-
51
- class WebSocketEvent(BaseModel):
52
- """WebSocket event structure"""
53
-
54
- type: Literal[
55
- "agent_start",
56
- "agent_progress",
57
- "agent_complete",
58
- "agent_error",
59
- "vnc_url_set",
60
- "vnc_url_unset",
61
- "heartbeat",
62
- ]
63
- content: Optional[str] = None
64
- metadata: Optional[AgentMetadata] = None
65
- messageId: Optional[str] = None
66
- vncUrl: Optional[str] = None
67
-
68
-
69
- class UserTaskMessage(BaseModel):
70
- """Message sent from frontend to backend"""
71
-
72
- type: Literal["user_task"]
73
- content: str
74
- model_id: str
75
- timestamp: str
76
-
77
-
78
- class AgentMessage(BaseModel):
79
- """Agent message structure"""
80
-
81
- id: str
82
- type: Literal["user", "agent"]
83
- content: str
84
- timestamp: datetime
85
- metadata: Optional[AgentMetadata] = None
86
- isLoading: Optional[bool] = None
87
- truncated: Optional[bool] = None
88
-
89
-
90
- class HealthResponse(BaseModel):
91
- """Health check response"""
92
-
93
- status: str
94
- timestamp: datetime
95
- websocket_connections: int
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cua2-core/src/cua2-core/services/agent_service.py DELETED
@@ -1,130 +0,0 @@
1
- import asyncio
2
- import uuid
3
- from datetime import datetime
4
- from typing import Optional
5
-
6
- from smolagents import Model
7
-
8
- from backend.models.models import ActiveTask, AgentMetadata
9
- from backend.services.agents.get_agents import get_agent
10
- from backend.services.models.get_model import get_model
11
- from backend.websocket.websocket_manager import WebSocketManager
12
- from computer_use_studio import Sandbox
13
- from computer_use_studio.logger import get_logger
14
-
15
- logger = get_logger(__name__)
16
-
17
-
18
- class AgentService:
19
- """Service for handling agent tasks and processing"""
20
-
21
- def __init__(self, websocket_manager):
22
- self.active_tasks: dict[str, ActiveTask] = {}
23
- self.websocket_manager: WebSocketManager = websocket_manager
24
-
25
- async def process_user_task(self, content: str, model_id: str) -> str:
26
- """Process a user task and return the message ID"""
27
-
28
- message_id = str(uuid.uuid4())
29
- while message_id in self.active_tasks.keys():
30
- message_id = str(uuid.uuid4())
31
-
32
- # Store the task
33
- self.active_tasks[message_id] = ActiveTask(
34
- message_id=message_id,
35
- content=content,
36
- model_id=model_id,
37
- start_time=datetime.now(),
38
- status="processing",
39
- )
40
-
41
- # Determine the agent type based on the content of the task (TODO: implement agent type detection using LLM)
42
- prompt_type = "FORM_SYSTEM_PROMPT"
43
-
44
- # Start the agent processing in the background
45
- asyncio.create_task(
46
- self._simulate_agent_processing(content, model_id, message_id, prompt_type)
47
- )
48
-
49
- return message_id
50
-
51
-
52
- # async def _simulate_agent_processing(self, message_id: str, content: str):
53
- # """Simulate agent processing with progress updates"""
54
- # try:
55
- # # Send agent start event
56
- # await self.websocket_manager.send_agent_start(
57
- # content=f"Starting task: {content}", message_id=message_id
58
- # )
59
- #
60
- # # Simulate processing steps
61
- # steps = [
62
- # "Analyzing task requirements...",
63
- # "Planning execution steps...",
64
- # "Initializing computer interface...",
65
- # "Executing task commands...",
66
- # "Verifying results...",
67
- # "Finalizing task completion...",
68
- # ]
69
- #
70
- # for i, step in enumerate(steps):
71
- # await asyncio.sleep(2) # Simulate processing time
72
- #
73
- # # Send progress update
74
- # await self.websocket_manager.send_agent_progress(
75
- # content=f"{step} ({i + 1}/{len(steps)})", message_id=message_id
76
- # )
77
- #
78
- # # Simulate VNC URL events during processing
79
- # if i == 2: # After "Initializing computer interface..."
80
- # # Set VNC URL when computer interface is ready
81
- # vnc_url = "http://localhost:6080/vnc.html?host=localhost&port=5900&autoconnect=true"
82
- # await self.websocket_manager.send_vnc_url_set(
83
- # vnc_url=vnc_url,
84
- # content="Computer interface ready, VNC stream connected",
85
- # )
86
- # elif i == 4: # After "Verifying results..."
87
- # # Unset VNC URL when task is almost complete
88
- # await self.websocket_manager.send_vnc_url_unset(
89
- # content="Task verification complete, disconnecting VNC stream"
90
- # )
91
- #
92
- # # Calculate metadata
93
- # end_time = datetime.now()
94
- # start_time = self.active_tasks[message_id]["start_time"]
95
- # time_taken = (end_time - start_time).total_seconds()
96
- #
97
- # metadata = AgentMetadata(
98
- # tokensUsed=150 + len(content) * 2, # Simulate token usage
99
- # timeTaken=time_taken,
100
- # numberOfSteps=len(steps),
101
- # )
102
- #
103
- # # Send completion event
104
- # await self.websocket_manager.send_agent_complete(
105
- # content=f"Task completed successfully: {content}",
106
- # message_id=message_id,
107
- # metadata=metadata,
108
- # )
109
- #
110
- # # Clean up
111
- # if message_id in self.active_tasks:
112
- # del self.active_tasks[message_id]
113
- #
114
- # except Exception as e:
115
- # # Send error event
116
- # await self.websocket_manager.send_agent_error(
117
- # content=f"Error processing task: {str(e)}", message_id=message_id
118
- # )
119
- #
120
- # # Clean up
121
- # if message_id in self.active_tasks:
122
- # del self.active_tasks[message_id]
123
-
124
- def get_active_tasks(self) -> dict:
125
- """Get currently active tasks"""
126
- return self.active_tasks.copy()
127
-
128
- def get_task_status(self, message_id: str) -> Optional[dict]:
129
- """Get status of a specific task"""
130
- return self.active_tasks.get(message_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cua2-core/src/cua2-core/services/agents/get_agents.py DELETED
@@ -1,57 +0,0 @@
1
- from typing import Annotated, TypeAlias
2
-
3
- from pydantic import Field
4
- from smolagents import Model
5
-
6
- from backend.models.models import AgentType
7
- from backend.services.agents.normalized_1000_agent import Normalized1000Agent
8
- from backend.services.agents.normalized_agent import NormalizedAgent
9
- from backend.services.agents.pixel_coordonates_agent import PixelCoordinatesAgent
10
- from backend.services.agents.prompt import (
11
- Normalized1000CoordinatesSystemPrompt,
12
- NormalizedCoordinatesSystemPrompt,
13
- PixelCoordinatesSystemPrompt,
14
- )
15
- from computer_use_studio import Sandbox
16
-
17
- Agent: TypeAlias = Annotated[
18
- PixelCoordinatesAgent | Normalized1000Agent | NormalizedAgent,
19
- Field(discriminator="AGENT_TYPE"),
20
- ]
21
-
22
-
23
- def get_agent(
24
- model: Model,
25
- desktop: Sandbox,
26
- agent_type: AgentType,
27
- prompt_type: str,
28
- data_dir: str,
29
- **kwargs,
30
- ) -> Agent:
31
- """Get the agent by type"""
32
- if agent_type == AgentType.PIXEL_COORDINATES:
33
- return PixelCoordinatesAgent(
34
- model=model,
35
- desktop=desktop,
36
- system_prompt=PixelCoordinatesSystemPrompt[prompt_type].value,
37
- data_dir=data_dir,
38
- **kwargs,
39
- )
40
- elif agent_type == AgentType.NORMALIZED_1000_COORDINATES:
41
- return Normalized1000Agent(
42
- model=model,
43
- desktop=desktop,
44
- system_prompt=Normalized1000CoordinatesSystemPrompt[prompt_type].value,
45
- data_dir=data_dir,
46
- **kwargs,
47
- )
48
- elif agent_type == AgentType.NORMALIZED_COORDINATES:
49
- return Normalized1000Agent(
50
- model=model,
51
- desktop=desktop,
52
- system_prompt=NormalizedCoordinatesSystemPrompt[prompt_type].value,
53
- data_dir=data_dir,
54
- **kwargs,
55
- )
56
- else:
57
- raise ValueError(f"Invalid agent type: {agent_type}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cua2-core/src/cua2-core/services/agents/normalized_1000_agent.py DELETED
@@ -1,293 +0,0 @@
1
- import time
2
- import unicodedata
3
- from typing import List, Literal
4
-
5
- # SmolaAgents imports
6
- from smolagents import Model, Tool, tool
7
- from smolagents.monitoring import LogLevel
8
-
9
- from backend.models.models import AgentType
10
- from backend.services.agents.prompt import Normalized1000CoordinatesSystemPrompt
11
- from computer_use_studio import DesktopAgentBase, Sandbox
12
-
13
-
14
- class Normalized1000Agent(DesktopAgentBase):
15
- """Agent for desktop automation with normalized coordinates (0 to 1000)"""
16
-
17
- AGENT_TYPE = AgentType.NORMALIZED_1000_COORDINATES
18
-
19
- def __init__(
20
- self,
21
- model: Model,
22
- data_dir: str,
23
- desktop: Sandbox,
24
- system_prompt: Normalized1000CoordinatesSystemPrompt,
25
- tools: List[Tool] | None = None,
26
- max_steps: int = 20,
27
- verbosity_level: LogLevel = LogLevel.INFO,
28
- planning_interval: int | None = None,
29
- use_v1_prompt: bool = False,
30
- **kwargs,
31
- ):
32
- super().__init__(
33
- model=model,
34
- data_dir=data_dir,
35
- desktop=desktop,
36
- system_prompt=system_prompt,
37
- tools=tools,
38
- max_steps=max_steps,
39
- verbosity_level=verbosity_level,
40
- planning_interval=planning_interval,
41
- use_v1_prompt=use_v1_prompt,
42
- **kwargs,
43
- )
44
-
45
- def _normalize_to_pixel(self, norm_x: int, norm_y: int) -> tuple[int, int]:
46
- """
47
- Convert normalized coordinates (0-1000) to pixel coordinates
48
- Args:
49
- norm_x: Normalized x coordinate (0 to 1000)
50
- norm_y: Normalized y coordinate (0 to 1000)
51
- Returns:
52
- Tuple of (pixel_x, pixel_y)
53
- """
54
- # Clamp values to valid range
55
- norm_x = max(0, min(1000, norm_x))
56
- norm_y = max(0, min(1000, norm_y))
57
-
58
- # Convert from 0-1000 range to 0-1 range, then to pixels
59
- norm_x_float = norm_x / 1000.0
60
- norm_y_float = norm_y / 1000.0
61
-
62
- pixel_x = int(norm_x_float * self.width)
63
- pixel_y = int(norm_y_float * self.height)
64
-
65
- # Ensure we don't go outside screen bounds
66
- pixel_x = max(0, min(self.width - 1, pixel_x))
67
- pixel_y = max(0, min(self.height - 1, pixel_y))
68
-
69
- return pixel_x, pixel_y
70
-
71
- def _setup_desktop_tools(self):
72
- """Register all desktop tools with normalized coordinate support (0-1000)"""
73
-
74
- @tool
75
- def click(x: int, y: int) -> str:
76
- """
77
- Performs a left-click at the specified normalized coordinates
78
- Args:
79
- x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
80
- y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
81
- """
82
- pixel_x, pixel_y = self._normalize_to_pixel(x, y)
83
- self.desktop.left_click(pixel_x, pixel_y)
84
- self.click_coordinates = (pixel_x, pixel_y)
85
- self.logger.log(
86
- f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
87
- )
88
- time.sleep(1)
89
- return f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
90
-
91
- @tool
92
- def right_click(x: int, y: int) -> str:
93
- """
94
- Performs a right-click at the specified normalized coordinates
95
- Args:
96
- x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
97
- y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
98
- """
99
- pixel_x, pixel_y = self._normalize_to_pixel(x, y)
100
- self.desktop.right_click(pixel_x, pixel_y)
101
- self.click_coordinates = (pixel_x, pixel_y)
102
- self.logger.log(
103
- f"Right-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
104
- )
105
- return f"Right-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
106
-
107
- @tool
108
- def double_click(x: int, y: int) -> str:
109
- """
110
- Performs a double-click at the specified normalized coordinates
111
- Args:
112
- x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
113
- y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
114
- """
115
- pixel_x, pixel_y = self._normalize_to_pixel(x, y)
116
- self.desktop.double_click(pixel_x, pixel_y)
117
- self.click_coordinates = (pixel_x, pixel_y)
118
- self.logger.log(
119
- f"Double-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
120
- )
121
- return f"Double-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
122
-
123
- @tool
124
- def move_mouse(x: int, y: int) -> str:
125
- """
126
- Moves the mouse cursor to the specified normalized coordinates
127
- Args:
128
- x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
129
- y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
130
- """
131
- pixel_x, pixel_y = self._normalize_to_pixel(x, y)
132
- self.desktop.move_mouse(pixel_x, pixel_y)
133
- self.logger.log(
134
- f"Moved mouse to normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
135
- )
136
- return f"Moved mouse to normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
137
-
138
- def normalize_text(text):
139
- return "".join(
140
- c
141
- for c in unicodedata.normalize("NFD", text)
142
- if not unicodedata.combining(c)
143
- )
144
-
145
- @tool
146
- def write(text: str) -> str:
147
- """
148
- Types the specified text at the current cursor position.
149
- Args:
150
- text: The text to type
151
- """
152
- # clean_text = normalize_text(text)
153
- self.desktop.write(text, delay_in_ms=10)
154
- self.logger.log(f"Typed text: '{text}'")
155
- time.sleep(1)
156
- return f"Typed text: '{text}'"
157
-
158
- @tool
159
- def press(key: str) -> str:
160
- """
161
- Presses a keyboard key or combination of keys
162
- Args:
163
- key: The key to press (e.g. "enter", "space", "backspace", etc.) or a multiple keys string to press, for example "ctrl+a" or "ctrl+shift+a".
164
- """
165
- self.desktop.press(key)
166
- self.logger.log(f"Pressed key: {key}")
167
- time.sleep(0.1)
168
- return f"Pressed key: {key}"
169
-
170
- @tool
171
- def drag(x1: int, y1: int, x2: int, y2: int) -> str:
172
- """
173
- Clicks at normalized coordinates [x1, y1], drags mouse to [x2, y2], then release click.
174
- Args:
175
- x1: origin normalized x coordinate (0 to 1000)
176
- y1: origin normalized y coordinate (0 to 1000)
177
- x2: end normalized x coordinate (0 to 1000)
178
- y2: end normalized y coordinate (0 to 1000)
179
- """
180
- pixel_x1, pixel_y1 = self._normalize_to_pixel(x1, y1)
181
- pixel_x2, pixel_y2 = self._normalize_to_pixel(x2, y2)
182
- self.desktop.drag((pixel_x1, pixel_y1), (pixel_x2, pixel_y2))
183
- message = f"Dragged and dropped from normalized [{x1}, {y1}] to [{x2}, {y2}] -> pixels [{pixel_x1}, {pixel_y1}] to [{pixel_x2}, {pixel_y2}]"
184
- self.logger.log(message)
185
- return message
186
-
187
- @tool
188
- def scroll(
189
- x: int,
190
- y: int,
191
- direction: Literal["up", "down"] = "down",
192
- amount: int = 2,
193
- ) -> str:
194
- """
195
- Moves the mouse to selected normalized coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
196
- Args:
197
- x: The normalized x coordinate (0 to 1000) of the element to scroll/zoom
198
- y: The normalized y coordinate (0 to 1000) of the element to scroll/zoom
199
- direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
200
- amount: The amount to scroll. A good amount is 1 or 2.
201
- """
202
- pixel_x, pixel_y = self._normalize_to_pixel(x, y)
203
- self.desktop.move_mouse(pixel_x, pixel_y)
204
- self.desktop.scroll(direction=direction, amount=amount)
205
- message = f"Scrolled {direction} by {amount} at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
206
- self.logger.log(message)
207
- return message
208
-
209
- @tool
210
- def wait(seconds: float) -> str:
211
- """
212
- Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
213
- Args:
214
- seconds: Number of seconds to wait, generally 3 is enough.
215
- """
216
- time.sleep(seconds)
217
- self.logger.log(f"Waited for {seconds} seconds")
218
- return f"Waited for {seconds} seconds"
219
-
220
- @tool
221
- def open(file_or_url: str) -> str:
222
- """
223
- Directly opens a browser with the specified url or opens a file with the default application: use this at start of web searches rather than trying to click the browser or open a file by clicking.
224
- Args:
225
- file_or_url: The URL or file to open
226
- """
227
-
228
- self.desktop.open(file_or_url)
229
- # Give it time to load
230
- time.sleep(2)
231
- self.logger.log(f"Opening: {file_or_url}")
232
- return f"Opened: {file_or_url}"
233
-
234
- @tool
235
- def launch_app(app_name: str) -> str:
236
- """
237
- Launches the specified application.
238
- Args:
239
- app_name: the name of the application to launch
240
- """
241
- self.desktop.launch(app_name)
242
- self.logger.log(f"Launched app: {app_name}")
243
- return f"Launched app: {app_name}"
244
-
245
- @tool
246
- def execute(command: str) -> str:
247
- """
248
- Executes a terminal command in the desktop environment.
249
- Args:
250
- command: The command to execute
251
- """
252
- self.desktop.execute_command(command)
253
- self.logger.log(f"Executed command: {command}")
254
- return f"Executed command: {command}"
255
-
256
- @tool
257
- def refresh() -> str:
258
- """
259
- Refreshes the current web page if you're in a browser.
260
- """
261
- self.desktop.press(["ctrl", "r"])
262
- self.logger.log("Refreshed the current page")
263
- return "Refreshed the current page"
264
-
265
- @tool
266
- def go_back() -> str:
267
- """
268
- Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
269
- Args:
270
- """
271
- self.desktop.press(["alt", "left"])
272
- self.logger.log("Went back one page")
273
- return "Went back one page"
274
-
275
- # Register the tools
276
- self.tools["click"] = click
277
- self.tools["right_click"] = right_click
278
- self.tools["double_click"] = double_click
279
- self.tools["move_mouse"] = move_mouse
280
- self.tools["write"] = write
281
- self.tools["press"] = press
282
- self.tools["scroll"] = scroll
283
- self.tools["wait"] = wait
284
- self.tools["open"] = open
285
- self.tools["go_back"] = go_back
286
- self.tools["drag"] = drag
287
- self.tools["launch_app"] = launch_app
288
- self.tools["execute"] = execute
289
- self.tools["refresh"] = refresh
290
- self.tools["refresh"] = refresh
291
- self.tools["execute"] = execute
292
- self.tools["refresh"] = refresh
293
- self.tools["refresh"] = refresh
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cua2-core/src/cua2-core/services/agents/normalized_agent.py DELETED
@@ -1,282 +0,0 @@
1
- import time
2
- import unicodedata
3
- from typing import List, Literal
4
-
5
- # SmolaAgents imports
6
- from smolagents import Model, Tool, tool
7
- from smolagents.monitoring import LogLevel
8
-
9
- from backend.models.models import AgentType
10
- from backend.services.agents.prompt import NormalizedCoordinatesSystemPrompt
11
- from computer_use_studio import DesktopAgentBase, Sandbox
12
-
13
-
14
- class NormalizedAgent(DesktopAgentBase):
15
- """Agent for desktop automation with normalized coordinates (0.0 to 1.0)"""
16
-
17
- AGENT_TYPE = AgentType.NORMALIZED_COORDINATES
18
-
19
- def __init__(
20
- self,
21
- model: Model,
22
- data_dir: str,
23
- desktop: Sandbox,
24
- system_prompt: NormalizedCoordinatesSystemPrompt,
25
- tools: List[Tool] | None = None,
26
- max_steps: int = 20,
27
- verbosity_level: LogLevel = LogLevel.INFO,
28
- planning_interval: int | None = None,
29
- use_v1_prompt: bool = False,
30
- **kwargs,
31
- ):
32
- super().__init__(
33
- model=model,
34
- data_dir=data_dir,
35
- desktop=desktop,
36
- system_prompt=system_prompt,
37
- tools=tools,
38
- max_steps=max_steps,
39
- verbosity_level=verbosity_level,
40
- planning_interval=planning_interval,
41
- use_v1_prompt=use_v1_prompt,
42
- **kwargs,
43
- )
44
-
45
- def _normalize_to_pixel(self, norm_x: float, norm_y: float) -> tuple[int, int]:
46
- """
47
- Convert normalized coordinates (0.0-1.0) to pixel coordinates
48
- Args:
49
- norm_x: Normalized x coordinate (0.0 to 1.0)
50
- norm_y: Normalized y coordinate (0.0 to 1.0)
51
- Returns:
52
- Tuple of (pixel_x, pixel_y)
53
- """
54
- # Clamp values to valid range
55
- norm_x = max(0.0, min(1.0, norm_x))
56
- norm_y = max(0.0, min(1.0, norm_y))
57
-
58
- pixel_x = int(norm_x * self.width)
59
- pixel_y = int(norm_y * self.height)
60
-
61
- # Ensure we don't go outside screen bounds
62
- pixel_x = max(0, min(self.width - 1, pixel_x))
63
- pixel_y = max(0, min(self.height - 1, pixel_y))
64
-
65
- return pixel_x, pixel_y
66
-
67
- def _setup_desktop_tools(self):
68
- """Register all desktop tools with normalized coordinate support"""
69
-
70
- @tool
71
- def click(x: float, y: float) -> str:
72
- """
73
- Performs a left-click at the specified normalized coordinates
74
- Args:
75
- x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
76
- y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
77
- """
78
- pixel_x, pixel_y = self._normalize_to_pixel(x, y)
79
- self.desktop.left_click(pixel_x, pixel_y)
80
- self.click_coordinates = (pixel_x, pixel_y)
81
- self.logger.log(
82
- f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
83
- )
84
- return f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
85
-
86
- @tool
87
- def right_click(x: float, y: float) -> str:
88
- """
89
- Performs a right-click at the specified normalized coordinates
90
- Args:
91
- x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
92
- y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
93
- """
94
- pixel_x, pixel_y = self._normalize_to_pixel(x, y)
95
- self.desktop.right_click(pixel_x, pixel_y)
96
- self.click_coordinates = (pixel_x, pixel_y)
97
- self.logger.log(
98
- f"Right-clicked at normalized coordinates ({pixel_x}, {pixel_y})"
99
- )
100
- return f"Right-clicked at normalized coordinates ({pixel_x}, {pixel_y})"
101
-
102
- @tool
103
- def double_click(x: float, y: float) -> str:
104
- """
105
- Performs a double-click at the specified normalized coordinates
106
- Args:
107
- x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
108
- y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
109
- """
110
- pixel_x, pixel_y = self._normalize_to_pixel(x, y)
111
- self.desktop.double_click(pixel_x, pixel_y)
112
- self.click_coordinates = (pixel_x, pixel_y)
113
- self.logger.log(
114
- f"Double-clicked at normalized coordinates ({pixel_x}, {pixel_y})"
115
- )
116
- return f"Double-clicked at normalized coordinates ({pixel_x}, {pixel_y})"
117
-
118
- @tool
119
- def move_mouse(x: float, y: float) -> str:
120
- """
121
- Moves the mouse cursor to the specified normalized coordinates
122
- Args:
123
- x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
124
- y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
125
- """
126
- pixel_x, pixel_y = self._normalize_to_pixel(x, y)
127
- self.desktop.move_mouse(pixel_x, pixel_y)
128
- self.logger.log(
129
- f"Moved mouse to normalized coordinates ({pixel_x}, {pixel_y})"
130
- )
131
- return f"Moved mouse to normalized coordinates ({pixel_x}, {pixel_y})"
132
-
133
- def normalize_text(text):
134
- return "".join(
135
- c
136
- for c in unicodedata.normalize("NFD", text)
137
- if not unicodedata.combining(c)
138
- )
139
-
140
- @tool
141
- def write(text: str) -> str:
142
- """
143
- Types the specified text at the current cursor position.
144
- Args:
145
- text: The text to type
146
- """
147
- # clean_text = normalize_text(text)
148
- self.desktop.write(text, delay_in_ms=10)
149
- self.logger.log(f"Typed text: '{text}'")
150
- return f"Typed text: '{text}'"
151
-
152
- @tool
153
- def press(key: str) -> str:
154
- """
155
- Presses a keyboard key or combination of keys
156
- Args:
157
- key: The key to press (e.g. "enter", "space", "backspace", etc.) or a multiple keys string to press, for example "ctrl+a" or "ctrl+shift+a".
158
- """
159
- self.desktop.press(key)
160
- self.logger.log(f"Pressed key: {key}")
161
- return f"Pressed key: {key}"
162
-
163
- @tool
164
- def drag(x1: float, y1: float, x2: float, y2: float) -> str:
165
- """
166
- Clicks at normalized coordinates [x1, y1], drags mouse to [x2, y2], then release click.
167
- Args:
168
- x1: origin normalized x coordinate (0.0 to 1.0)
169
- y1: origin normalized y coordinate (0.0 to 1.0)
170
- x2: end normalized x coordinate (0.0 to 1.0)
171
- y2: end normalized y coordinate (0.0 to 1.0)
172
- """
173
- pixel_x1, pixel_y1 = self._normalize_to_pixel(x1, y1)
174
- pixel_x2, pixel_y2 = self._normalize_to_pixel(x2, y2)
175
- self.desktop.drag((pixel_x1, pixel_y1), (pixel_x2, pixel_y2))
176
- message = f"Dragged and dropped from normalized [{pixel_x1}, {pixel_y1}] to [{pixel_x2}, {pixel_y2}]"
177
- self.logger.log(message)
178
- return message
179
-
180
- @tool
181
- def scroll(
182
- x: float,
183
- y: float,
184
- direction: Literal["up", "down"] = "down",
185
- amount: int = 2,
186
- ) -> str:
187
- """
188
- Moves the mouse to selected normalized coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
189
- Args:
190
- x: The normalized x coordinate (0.0 to 1.0) of the element to scroll/zoom
191
- y: The normalized y coordinate (0.0 to 1.0) of the element to scroll/zoom
192
- direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
193
- amount: The amount to scroll. A good amount is 1 or 2.
194
- """
195
- pixel_x, pixel_y = self._normalize_to_pixel(x, y)
196
- self.desktop.move_mouse(pixel_x, pixel_y)
197
- self.desktop.scroll(direction=direction, amount=amount)
198
- message = f"Scrolled {direction} by {amount} at normalized coordinates ({pixel_x}, {pixel_y})"
199
- self.logger.log(message)
200
- return message
201
-
202
- @tool
203
- def wait(seconds: float) -> str:
204
- """
205
- Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
206
- Args:
207
- seconds: Number of seconds to wait, generally 3 is enough.
208
- """
209
- time.sleep(seconds)
210
- self.logger.log(f"Waited for {seconds} seconds")
211
- return f"Waited for {seconds} seconds"
212
-
213
- @tool
214
- def open(file_or_url: str) -> str:
215
- """
216
- Directly opens a browser with the specified url or opens a file with the default application: use this at start of web searches rather than trying to click the browser or open a file by clicking.
217
- Args:
218
- file_or_url: The URL or file to open
219
- """
220
-
221
- self.desktop.open(file_or_url)
222
- # Give it time to load
223
- time.sleep(2)
224
- self.logger.log(f"Opening: {file_or_url}")
225
- return f"Opened: {file_or_url}"
226
-
227
- @tool
228
- def launch_app(app_name: str) -> str:
229
- """
230
- Launches the specified application.
231
- Args:
232
- app_name: the name of the application to launch
233
- """
234
- self.desktop.launch(app_name)
235
- self.logger.log(f"Launched app: {app_name}")
236
- return f"Launched app: {app_name}"
237
-
238
- @tool
239
- def execute(command: str) -> str:
240
- """
241
- Executes a terminal command in the desktop environment.
242
- Args:
243
- command: The command to execute
244
- """
245
- self.desktop.execute_command(command)
246
- self.logger.log(f"Executed command: {command}")
247
- return f"Executed command: {command}"
248
-
249
- @tool
250
- def refresh() -> str:
251
- """
252
- Refreshes the current web page if you're in a browser.
253
- """
254
- self.desktop.press(["ctrl", "r"])
255
- self.logger.log("Refreshed the current page")
256
- return "Refreshed the current page"
257
-
258
- @tool
259
- def go_back() -> str:
260
- """
261
- Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
262
- Args:
263
- """
264
- self.desktop.press(["alt", "left"])
265
- self.logger.log("Went back one page")
266
- return "Went back one page"
267
-
268
- # Register the tools
269
- self.tools["click"] = click
270
- self.tools["right_click"] = right_click
271
- self.tools["double_click"] = double_click
272
- self.tools["move_mouse"] = move_mouse
273
- self.tools["write"] = write
274
- self.tools["press"] = press
275
- self.tools["scroll"] = scroll
276
- self.tools["wait"] = wait
277
- self.tools["open"] = open
278
- self.tools["go_back"] = go_back
279
- self.tools["drag"] = drag
280
- self.tools["launch_app"] = launch_app
281
- self.tools["execute"] = execute
282
- self.tools["refresh"] = refresh
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cua2-core/src/cua2-core/services/agents/pixel_coordonates_agent.py DELETED
@@ -1,317 +0,0 @@
1
- import time
2
- import unicodedata
3
- from typing import List, Literal
4
-
5
- # smolagents imports
6
- from smolagents import Model, Tool, tool
7
- from smolagents.monitoring import LogLevel
8
-
9
- from backend.models.models import AgentType
10
- from backend.services.agents.prompt import PixelCoordinatesSystemPrompt
11
- from computer_use_studio import DesktopAgentBase, Sandbox
12
-
13
-
14
- class PixelCoordinatesAgent(DesktopAgentBase):
15
- """Agent for desktop automation"""
16
-
17
- AGENT_TYPE = AgentType.PIXEL_COORDINATES
18
-
19
- def __init__(
20
- self,
21
- model: Model,
22
- data_dir: str,
23
- desktop: Sandbox,
24
- system_prompt: PixelCoordinatesSystemPrompt,
25
- tools: List[Tool] | None = None,
26
- max_steps: int = 20,
27
- verbosity_level: LogLevel = LogLevel.INFO,
28
- planning_interval: int | None = None,
29
- use_v1_prompt: bool = False,
30
- **kwargs,
31
- ):
32
- super().__init__(
33
- model=model,
34
- data_dir=data_dir,
35
- desktop=desktop,
36
- system_prompt=system_prompt,
37
- tools=tools,
38
- max_steps=max_steps,
39
- verbosity_level=verbosity_level,
40
- planning_interval=planning_interval,
41
- use_v1_prompt=use_v1_prompt,
42
- **kwargs,
43
- )
44
-
45
- # OPTIONAL: Add a custom prompt template - see src/computer_use_studio/desktop_agent/desktop_agent_base.py for more details about the default prompt template
46
- # self.prompt_templates["system_prompt"] = CUSTOM_PROMPT_TEMPLATE.replace(
47
- # "<<resolution_x>>", str(self.width)
48
- # ).replace("<<resolution_y>>", str(self.height))
49
- # Important: Change the prompt to get better results, depending on your action space.
50
-
51
- def _setup_desktop_tools(self):
52
- """Register all desktop tools"""
53
-
54
- @tool
55
- def click(x: int, y: int) -> str:
56
- """
57
- Performs a left-click at the specified coordinates
58
- Args:
59
- x: The x coordinate (horizontal position)
60
- y: The y coordinate (vertical position)
61
- """
62
- self.desktop.left_click(x, y)
63
- self.click_coordinates = (x, y)
64
- self.logger.log(f"Clicked at coordinates ({x}, {y})")
65
- return f"Clicked at coordinates ({x}, {y})"
66
-
67
- @tool
68
- def right_click(x: int, y: int) -> str:
69
- """
70
- Performs a right-click at the specified coordinates
71
- Args:
72
- x: The x coordinate (horizontal position)
73
- y: The y coordinate (vertical position)
74
- """
75
- self.desktop.right_click(x, y)
76
- self.click_coordinates = (x, y)
77
- self.logger.log(f"Right-clicked at coordinates ({x}, {y})")
78
- return f"Right-clicked at coordinates ({x}, {y})"
79
-
80
- @tool
81
- def double_click(x: int, y: int) -> str:
82
- """
83
- Performs a double-click at the specified coordinates
84
- Args:
85
- x: The x coordinate (horizontal position)
86
- y: The y coordinate (vertical position)
87
- """
88
- self.desktop.double_click(x, y)
89
- self.click_coordinates = (x, y)
90
- self.logger.log(f"Double-clicked at coordinates ({x}, {y})")
91
- return f"Double-clicked at coordinates ({x}, {y})"
92
-
93
- @tool
94
- def move_mouse(x: int, y: int) -> str:
95
- """
96
- Moves the mouse cursor to the specified coordinates
97
- Args:
98
- x: The x coordinate (horizontal position)
99
- y: The y coordinate (vertical position)
100
- """
101
- self.desktop.move_mouse(x, y)
102
- self.logger.log(f"Moved mouse to coordinates ({x}, {y})")
103
- return f"Moved mouse to coordinates ({x}, {y})"
104
-
105
- def normalize_text(text):
106
- return "".join(
107
- c
108
- for c in unicodedata.normalize("NFD", text)
109
- if not unicodedata.combining(c)
110
- )
111
-
112
- @tool
113
- def write(text: str) -> str:
114
- """
115
- Types the specified text at the current cursor position.
116
- Args:
117
- text: The text to type
118
- """
119
- # clean_text = normalize_text(text)
120
- self.desktop.write(text, delay_in_ms=10)
121
- self.logger.log(f"Typed text: '{text}'")
122
- return f"Typed text: '{text}'"
123
-
124
- @tool
125
- def press(key: str) -> str:
126
- """
127
- Presses a keyboard key or combination of keys
128
- Args:
129
- key: The key to press (e.g. "enter", "space", "backspace", etc.) or a multiple keys string to press, for example "ctrl+a" or "ctrl+shift+a".
130
- """
131
- self.desktop.press(key)
132
- self.logger.log(f"Pressed key: {key}")
133
- return f"Pressed key: {key}"
134
-
135
- @tool
136
- def drag(x1: int, y1: int, x2: int, y2: int) -> str:
137
- """
138
- Clicks [x1, y1], drags mouse to [x2, y2], then release click.
139
- Args:
140
- x1: origin x coordinate
141
- y1: origin y coordinate
142
- x2: end x coordinate
143
- y2: end y coordinate
144
- """
145
- self.desktop.drag((x1, y1), (x2, y2))
146
- message = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
147
- self.logger.log(message)
148
- return message
149
-
150
- @tool
151
- def scroll(
152
- x: int, y: int, direction: Literal["up", "down"] = "down", amount: int = 2
153
- ) -> str:
154
- """
155
- Moves the mouse to selected coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
156
- Args:
157
- x: The x coordinate (horizontal position) of the element to scroll/zoom
158
- y: The y coordinate (vertical position) of the element to scroll/zoom
159
- direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
160
- amount: The amount to scroll. A good amount is 1 or 2.
161
- """
162
- self.desktop.move_mouse(x, y)
163
- self.desktop.scroll(direction=direction, amount=amount)
164
- message = f"Scrolled {direction} by {amount}"
165
- self.logger.log(message)
166
- return message
167
-
168
- @tool
169
- def wait(seconds: float) -> str:
170
- """
171
- Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
172
- Args:
173
- seconds: Number of seconds to wait, generally 3 is enough.
174
- """
175
- time.sleep(seconds)
176
- self.logger.log(f"Waited for {seconds} seconds")
177
- return f"Waited for {seconds} seconds"
178
-
179
- @tool
180
- def open(file_or_url: str) -> str:
181
- """
182
- Directly opens a browser with the specified url or opens a file with the default application: use this at start of web searches rather than trying to click the browser or open a file by clicking.
183
- Args:
184
- file_or_url: The URL or file to open
185
- """
186
-
187
- self.desktop.open(file_or_url)
188
- # Give it time to load
189
- time.sleep(2)
190
- self.logger.log(f"Opening: {file_or_url}")
191
- return f"Opened: {file_or_url}"
192
-
193
- @tool
194
- def launch_app(app_name: str) -> str:
195
- """
196
- Launches the specified application.
197
- Args:
198
- app_name: the name of the application to launch
199
- """
200
- self.desktop.launch(app_name)
201
- self.logger.log(f"Launched app: {app_name}")
202
- return f"Launched app: {app_name}"
203
-
204
- @tool
205
- def execute(command: str) -> str:
206
- """
207
- Executes a terminal command in the desktop environment.
208
- Args:
209
- command: The command to execute
210
- """
211
- self.desktop.execute_command(command)
212
- self.logger.log(f"Executed command: {command}")
213
- return f"Executed command: {command}"
214
-
215
- @tool
216
- def refresh() -> str:
217
- """
218
- Refreshes the current web page if you're in a browser.
219
- """
220
- self.desktop.press(["ctrl", "r"])
221
- self.logger.log("Refreshed the current page")
222
- return "Refreshed the current page"
223
-
224
- @tool
225
- def go_back() -> str:
226
- """
227
- Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
228
- Args:
229
- """
230
- self.desktop.press(["alt", "left"])
231
- self.logger.log("Went back one page")
232
- return "Went back one page"
233
-
234
- # Register the tools
235
- self.tools["click"] = click
236
- self.tools["right_click"] = right_click
237
- self.tools["double_click"] = double_click
238
- self.tools["move_mouse"] = move_mouse
239
- self.tools["write"] = write
240
- self.tools["press"] = press
241
- self.tools["scroll"] = scroll
242
- self.tools["wait"] = wait
243
- self.tools["open"] = open
244
- self.tools["go_back"] = go_back
245
- self.tools["drag"] = drag
246
- self.tools["launch_app"] = launch_app
247
- self.tools["execute"] = execute
248
- self.tools["refresh"] = refresh
249
-
250
-
251
- if __name__ == "__main__":
252
- # ================================
253
- # MODEL CONFIGURATION
254
- # ================================
255
-
256
- # import os
257
-
258
- # from smolagents import OpenAIServerModel
259
-
260
- # model = OpenAIServerModel(
261
- # model_id="gpt-4.1",
262
- # api_key=os.getenv("OPENAI_API_KEY"),
263
- # )
264
-
265
- # For Inference Endpoints
266
- # from smolagents import HfApiModel
267
- # model = HfApiModel(
268
- # model_id="Qwen/Qwen2.5-VL-72B-Instruct",
269
- # token=os.getenv("HF_TOKEN"),
270
- # provider="nebius",
271
- # )
272
-
273
- # For Transformer models
274
- # from smolagents import TransformersModel
275
- # model = TransformersModel(
276
- # model_id="Qwen/Qwen2.5-VL-72B-Instruct",
277
- # device_map="auto",
278
- # torch_dtype="auto",
279
- # trust_remote_code=True,
280
- # )
281
-
282
- # For other providers
283
- from smolagents import LiteLLMModel
284
-
285
- model = LiteLLMModel(model_id="anthropic/claude-sonnet-4-5-20250929")
286
- # model = LiteLLMModel(model_id="gemini/gemini-2.5-flash")
287
-
288
- # ================================
289
- # RUN AGENT
290
- # ================================
291
-
292
- # Interactive task input loop
293
- sandbox = None
294
- agent = None
295
- while True:
296
- try:
297
- task = get_user_input()
298
- if task is None:
299
- exit()
300
- sandbox = Sandbox(headless=False, resolution=(1024, 1024))
301
- sandbox.start_recording()
302
- agent = FormAgent(model=model, data_dir="data", desktop=sandbox)
303
-
304
- print("\n🤖 Agent is working on your task...")
305
- print("-" * 60)
306
- result = agent.run(task)
307
- print("\n✅ Task completed successfully!")
308
- print(f"📄 Result: {result}")
309
- except Exception as e:
310
- print(f"\n❌ Error occurred: {str(e)}")
311
- finally:
312
- if sandbox:
313
- sandbox.end_recording("recording.mp4")
314
- if agent:
315
- agent.close()
316
-
317
- print("\n" + "=" * 60)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cua2-core/src/cua2-core/services/agents/prompt.py DELETED
@@ -1,548 +0,0 @@
1
- from enum import Enum
2
-
3
-
4
- class PixelCoordinatesSystemPrompt(Enum):
5
- """Pixel coordinates system prompt"""
6
-
7
- FORM_SYSTEM_PROMPT = """You are a web form automation assistant that can control a remote desktop environment with a web browser open.
8
- The current date is <<current_date>>.
9
-
10
- <action_process>
11
- You will be given a task to complete in several steps (e.g. filling forms, signing up, logging in, submitting claims).
12
- At each step you will perform **one action**.
13
- After each action, you will receive an updated screenshot.
14
- Then you will proceed as follows, with these sections — do not skip any:
15
-
16
- Short term goal: ...
17
- What I see: ...
18
- Reflection: ...
19
- Action:
20
- ```python
21
- tool_name(arguments)
22
- ```<end_code>
23
-
24
- Always format your Action section as **Python code blocks** exactly as shown above.
25
- </action_process>
26
-
27
- <tools>
28
- On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
29
- {%- for tool in tools.values() %}
30
- - {{ tool.name }}: {{ tool.description }}
31
- Takes inputs: {{tool.inputs}}
32
- Returns an output of type: {{tool.output_type}}
33
- {%- endfor %}
34
- </tools>
35
-
36
- <web_form_guidelines>
37
- Look at the elements on the screen (input fields, checkboxes, buttons, dropdowns) to decide where to interact.
38
- The desktop has a resolution of <<resolution_x>>x<<resolution_y>> pixels — use that to decide mouse coordinates.
39
- **Never use hypothetical or assumed coordinates; always use real coordinates visible on the screenshot.**
40
-
41
- ### Typical Web Form Interactions
42
- - **Input fields**: click in the field first to focus it, then use `write("text")`.
43
- - **Passwords**: type them just like text — `write("password123")`.
44
- - **Checkboxes / radio buttons**: use `click(x,y)` directly on the box/circle.
45
- - **Dropdown menus**: click to open, then click the option or use arrow keys + press("enter").
46
- - **Submit buttons**: identify clearly labelled “Sign up”, “Sign in”, “Submit” buttons and click at their coordinates.
47
- - **Captcha or 2FA**: wait or prompt for external handling (do not bypass security).
48
- - **Pop-ups**: try `press("enter")` to confirm or `press("escape")` to close if they block your action.
49
-
50
- ### Grouping Multiple Inputs
51
- - If multiple fields, checkboxes, or similar controls are clearly visible **and** can be filled/clicked in sequence without ambiguity, you may include **several actions in one code block**.
52
- - Keep each Action still in a **Python code block** but with multiple tool calls inside, for example:
53
- ```python
54
- click(450, 320) # Email field
55
- wait(0.1)
56
- write("[email protected]")
57
- click(450, 380) # Password field
58
- wait(0.1)
59
- write("mypassword123")
60
- click(430, 600) # Checkbox “Accept terms”
61
- wait(0.1)
62
- ```<end_code>
63
- - Only group actions when:
64
- 1. They’re all part of the **same form or step**,
65
- 2. The screenshot clearly shows all elements and coordinates,
66
- 3. The order of operations is obvious.
67
- - Otherwise, default back to one Action per step.
68
-
69
- ### Precision
70
- - Always **click before typing** to ensure the right field is active.
71
- - Always **scroll if needed** to bring elements into view before clicking.
72
- - Always **validate each action** via the screenshot before continuing.
73
-
74
- </web_form_guidelines>
75
-
76
- <task_resolution_example>
77
- For a task like “Sign up for an account and submit the form”:
78
-
79
- Step 1:
80
- Short term goal: I want to open the signup page.
81
- What I see: The browser is open on the homepage.
82
- Reflection: I will open the signup URL directly.
83
- Action:
84
- ```python
85
- open("https://example.com/signup")
86
- wait(3)
87
- ```<end_code>
88
-
89
- Step 2:
90
- Short term goal: I want to fill the “Email” field.
91
- What I see: I see the signup form with an “Email” field at (450, 320).
92
- Reflection: I will click inside the field then type my email.
93
- Action:
94
- ```python
95
- click(450, 320)
96
- write("[email protected]")
97
- ```<end_code>
98
-
99
- Step 3:
100
- Short term goal: I want to check the “I accept terms” checkbox.
101
- What I see: The checkbox is at (430, 600).
102
- Reflection: I will click it.
103
- Action:
104
- ```python
105
- click(430, 600)
106
- ```<end_code>
107
-
108
- Step 4:
109
- Short term goal: I want to submit the form.
110
- What I see: The “Sign Up” button at (500, 700).
111
- Reflection: I will click the button to submit.
112
- Action:
113
- ```python
114
- click(500, 700)
115
- wait(3)
116
- ```<end_code>
117
-
118
- Step 5:
119
- Short term goal: Verify signup completed.
120
- What I see: A confirmation page “Welcome [email protected]”.
121
- Reflection: Task complete.
122
- Action:
123
- ```python
124
- final_answer("Signup completed")
125
- ```<end_code>
126
- </task_resolution_example>
127
-
128
- <general_guidelines>
129
- # GUI Agent Guidelines for Web Forms
130
-
131
- ## Environment Overview
132
- Ubuntu 22.04 XFCE4 desktop with Google Chrome/Chromium browser.
133
- Agent can fill forms, sign up, sign in, click checkboxes, submit claims.
134
-
135
- ## Core Principles
136
-
137
- ### 1. Screenshot Analysis
138
- - Always analyze the latest screenshot carefully before each action.
139
- - Validate that previous actions worked by examining the current state.
140
- - If an action didn’t work, try an alternative rather than repeating blindly.
141
-
142
- ### 2. Action Execution
143
- - Execute one action or multiple actions at a time (grouped in one code block).
144
- - Wait for appropriate loading times using `wait()` but not indefinitely.
145
- - Scroll to bring hidden elements into view.
146
-
147
- ### 3. Keyboard Shortcuts
148
- - Use `tab` to move between fields, `space` to toggle checkboxes, `enter` to submit forms.
149
- - Copy/paste: `ctrl+C`, `ctrl+V`.
150
- - Refresh page: `refresh()`.
151
-
152
- ### 4. Error Recovery
153
- - If clicking doesn’t work, try double_click or right_click.
154
- - If typing doesn’t appear, ensure the field is focused with click.
155
- - If popups block the screen, try `press("enter")` or `press("escape")`.
156
-
157
- ### 5. Security & Privacy
158
- - Don’t attempt to bypass captchas or 2FA automatically.
159
- - Don’t store credentials in plain text unless instructed.
160
-
161
- ### 6. Final Answer
162
- - When the form is successfully submitted or the goal achieved, use:
163
- ```python
164
- final_answer("Done")
165
- ```<end_code>
166
- </general_guidelines>
167
- """
168
-
169
-
170
- class Normalized1000CoordinatesSystemPrompt(Enum):
171
- """Normalized 1000 coordinates system prompt"""
172
-
173
- FORM_SYSTEM_PROMPT = """You are a web form automation assistant that can control a remote desktop environment with a web browser open.
174
- The current date is <<current_date>>.
175
-
176
- <action_process>
177
- You will be given a task to complete in several steps (e.g. filling forms, signing up, logging in, submitting claims).
178
- At each step you will perform **one action**.
179
- After each action, you will receive an updated screenshot.
180
- Then you will proceed as follows, with these sections — do not skip any:
181
-
182
- Short term goal: ...
183
- What I see: ...
184
- Reflection: ...
185
- Action:
186
- ```python
187
- tool_name(arguments)
188
- ```<end_code>
189
-
190
- Always format your Action section as **Python code blocks** exactly as shown above.
191
- </action_process>
192
-
193
- <tools>
194
- On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
195
- {%- for tool in tools.values() %}
196
- - {{ tool.name }}: {{ tool.description }}
197
- Takes inputs: {{tool.inputs}}
198
- Returns an output of type: {{tool.output_type}}
199
- {%- endfor %}
200
- </tools>
201
-
202
- <coordinate_system>
203
- **IMPORTANT: This system uses NORMALIZED COORDINATES (0 to 1000)**
204
-
205
- You must use normalized coordinates:
206
- - **x-coordinate**: 0 = left edge, 1000 = right edge of screen
207
- - **y-coordinate**: 0 = top edge, 1000 = bottom edge of screen
208
- - **Example**: Center of screen is (500, 500)
209
- - **Example**: Top-left corner is (0, 0)
210
- - **Example**: Bottom-right corner is (1000, 1000)
211
-
212
- When you see an element on the screenshot:
213
- 1. Estimate its position relative to the screen dimensions
214
- 2. Convert to normalized coordinates between 0 and 1000
215
- 3. Use these normalized coordinates in your tool calls
216
-
217
- **Never use pixel coordinates directly - always use normalized coordinates between 0 and 1000**
218
- </coordinate_system>
219
-
220
- <web_form_guidelines>
221
- Look at the elements on the screen (input fields, checkboxes, buttons, dropdowns) to decide where to interact.
222
- **Always use normalized coordinates (0 to 1000) based on the element's relative position on the screen.**
223
-
224
- ### Typical Web Form Interactions
225
- - **Input fields**: click in the field first to focus it, then use `write("text")`.
226
- - **Passwords**: type them just like text — `write("password123")`.
227
- - **Checkboxes / radio buttons**: use `click(x,y)` directly on the box/circle. Click on the box/circle itself at the left side of the text, not on the text label.
228
- - **Dropdown menus**: click to open, then click the option or use arrow keys + press("enter").
229
- - **Submit buttons**: identify clearly labelled "Sign up", "Sign in", "Submit" buttons and click at their normalized coordinates.
230
- - **Captcha or 2FA**: wait or prompt for external handling (do not bypass security).
231
- - **Pop-ups**: try `press("enter")` to confirm or `press("escape")` to close if they block your action.
232
-
233
- ### Grouping Multiple Inputs
234
- - If multiple fields, checkboxes, or similar controls are clearly visible **and** can be filled/clicked in sequence without ambiguity, you may include **several actions in one code block**.
235
- - Keep each Action still in a **Python code block** but with multiple tool calls inside, for example:
236
- ```python
237
- click(470, 300) # Email field (normalized coordinates)
238
- write("[email protected]")
239
- click(470, 350) # Password field (normalized coordinates)
240
- write("mypassword123")
241
- click(450, 550) # Checkbox left side of the text "Accept terms" (normalized coordinates)
242
- ```<end_code>
243
-
244
- - Only group actions when:
245
- 1. They're all part of the **same form or step**,
246
- 2. The screenshot clearly shows all elements and coordinates,
247
- 3. The order of operations is obvious.
248
- - Otherwise, default back to one Action per step.
249
-
250
- ### Precision
251
- - Always **click before typing** to ensure the right field is active.
252
- - Always **scroll if needed** to bring elements into view before clicking.
253
- - Always **validate each action** via the screenshot before continuing.
254
- - Always use **normalized coordinates between 0 and 1000**.
255
- </web_form_guidelines>
256
-
257
- <task_resolution_example>
258
- For a task like "Sign up for an account and submit the form":
259
-
260
- Step 1:
261
- Short term goal: I want to open the signup page.
262
- What I see: The browser is open on the homepage.
263
- Reflection: I will open the signup URL directly.
264
- Action:
265
- ```python
266
- open("https://example.com/signup")
267
- wait(3)
268
- ```<end_code>
269
-
270
- Step 2:
271
- Short term goal: I want to fill the form fields that are currently visible.
272
- What I see: I see the signup form with "Email" and "Password" fields, plus a checkbox for accepting terms.
273
- Reflection: I will fill all the visible form fields in sequence - click the email field and type the email, then click the password field and type the password, then click the checkbox to accept terms.
274
- Action:
275
- ```python
276
- click(470, 300) # Email field (normalized coordinates)
277
- write("[email protected]")
278
- click(470, 350) # Password field (normalized coordinates)
279
- write("mypassword123")
280
- click(450, 550) # Checkbox left side of the text "Accept terms" (normalized coordinates)
281
- ```<end_code>
282
-
283
- Step 3:
284
- Short term goal: I need to scroll down to see the "Sign Up" button.
285
- What I see: The form fields are filled, but I cannot see the "Sign Up" button - it's likely below the current view.
286
- Reflection: I will scroll down to bring the submit button into view so I can click it in the next step.
287
- Action:
288
- ```python
289
- scroll(500, 500, "down", 3)
290
- ```<end_code>
291
-
292
- Step 4:
293
- Short term goal: I want to submit the form.
294
- What I see: The "Sign Up" button is at the bottom center, around 520, 650 in normalized coordinates.
295
- Reflection: I will click the button to submit.
296
- Action:
297
- ```python
298
- click(520, 650)
299
- wait(3)
300
- ```<end_code>
301
-
302
- Step 5:
303
- Short term goal: Verify signup completed.
304
- What I see: A confirmation page "Welcome [email protected]".
305
- Reflection: Task complete.
306
- Action:
307
- ```python
308
- final_answer("Signup completed")
309
- ```<end_code>
310
- </task_resolution_example>
311
-
312
- <general_guidelines>
313
- # GUI Agent Guidelines for Web Forms (0-1000 Coordinates)
314
-
315
- ## Environment Overview
316
- Ubuntu 22.04 XFCE4 desktop with Google Chrome/Chromium browser.
317
- Agent can fill forms, sign up, sign in, click checkboxes, submit claims.
318
- **All coordinates are normalized between 0 and 1000.**
319
-
320
- ## Core Principles
321
-
322
- ### 1. Screenshot Analysis
323
- - Always analyze the latest screenshot carefully before each action.
324
- - Validate that previous actions worked by examining the current state.
325
- - If an action didn't work, try an alternative rather than repeating blindly.
326
-
327
- ### 2. Action Execution
328
- - Execute one or multiple actions at a time (grouped in one code block).
329
- - Wait for appropriate loading times using `wait()` but not indefinitely.
330
- - Scroll to bring hidden elements into view.
331
-
332
- ### 3. Coordinate System
333
- - **CRITICAL**: Always use normalized coordinates (0 to 1000)
334
- - Convert visual position on screen to normalized coordinates
335
- - Center of screen = (500, 500)
336
- - Top-left = (0, 0), Bottom-right = (1000, 1000)
337
-
338
- ### 4. Keyboard Shortcuts
339
- - Use `tab` to move between fields, `space` to toggle checkboxes, `enter` to submit forms.
340
- - Copy/paste: `ctrl+C`, `ctrl+V`.
341
- - Refresh page: `refresh()`.
342
-
343
- ### 5. Error Recovery
344
- - If clicking doesn't work, try double_click or right_click.
345
- - If typing doesn't appear, ensure the field is focused with click.
346
- - If popups block the screen, try `press("enter")` or `press("escape")`.
347
-
348
- ### 6. Security & Privacy
349
- - Don't attempt to bypass captchas or 2FA automatically.
350
- - Don't store credentials in plain text unless instructed.
351
-
352
- ### 7. Final Answer
353
- - When the form is successfully submitted or the goal achieved, use:
354
- ```python
355
- final_answer("Done")
356
- ```<end_code>
357
- </general_guidelines>
358
- """
359
-
360
-
361
- class NormalizedCoordinatesSystemPrompt(Enum):
362
- """Normalized coordinates system prompt"""
363
-
364
- FORM_SYSTEM_PROMPT = """You are a web form automation assistant that can control a remote desktop environment with a web browser open.
365
- The current date is <<current_date>>.
366
-
367
- <action_process>
368
- You will be given a task to complete in several steps (e.g. filling forms, signing up, logging in, submitting claims).
369
- At each step you will perform **one action**.
370
- After each action, you will receive an updated screenshot.
371
- Then you will proceed as follows, with these sections — do not skip any:
372
-
373
- Short term goal: ...
374
- What I see: ...
375
- Reflection: ...
376
- Action:
377
- ```python
378
- tool_name(arguments)
379
- ```<end_code>
380
-
381
- Always format your Action section as **Python code blocks** exactly as shown above.
382
- </action_process>
383
-
384
- <tools>
385
- On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
386
- {%- for tool in tools.values() %}
387
- - {{ tool.name }}: {{ tool.description }}
388
- Takes inputs: {{tool.inputs}}
389
- Returns an output of type: {{tool.output_type}}
390
- {%- endfor %}
391
- </tools>
392
-
393
- <coordinate_system>
394
- **IMPORTANT: This system uses NORMALIZED COORDINATES (0.0 to 1.0)**
395
-
396
- You must use normalized coordinates:
397
- - **x-coordinate**: 0.0 = left edge, 1.0 = right edge of screen
398
- - **y-coordinate**: 0.0 = top edge, 1.0 = bottom edge of screen
399
- - **Example**: Center of screen is (0.5, 0.5)
400
- - **Example**: Top-left corner is (0.0, 0.0)
401
- - **Example**: Bottom-right corner is (1.0, 1.0)
402
-
403
- When you see an element on the screenshot:
404
- 1. Estimate its position relative to the screen dimensions
405
- 2. Convert to normalized coordinates between 0.0 and 1.0
406
- 3. Use these normalized coordinates in your tool calls
407
-
408
- **Never use pixel coordinates directly - always use normalized coordinates between 0.0 and 1.0**
409
- </coordinate_system>
410
-
411
- <web_form_guidelines>
412
- Look at the elements on the screen (input fields, checkboxes, buttons, dropdowns) to decide where to interact.
413
- **Always use normalized coordinates (0.0 to 1.0) based on the element's relative position on the screen.**
414
-
415
- ### Typical Web Form Interactions
416
- - **Input fields**: click in the field first to focus it, then use `write("text")`.
417
- - **Passwords**: type them just like text — `write("password123")`.
418
- - **Checkboxes / radio buttons**: use `click(x,y)` directly on the box/circle.
419
- - **Dropdown menus**: click to open, then click the option or use arrow keys + press("enter").
420
- - **Submit buttons**: identify clearly labelled "Sign up", "Sign in", "Submit" buttons and click at their normalized coordinates.
421
- - **Captcha or 2FA**: wait or prompt for external handling (do not bypass security).
422
- - **Pop-ups**: try `press("enter")` to confirm or `press("escape")` to close if they block your action.
423
-
424
- ### Grouping Multiple Inputs
425
- - If multiple fields, checkboxes, or similar controls are clearly visible **and** can be filled/clicked in sequence without ambiguity, you may include **several actions in one code block**.
426
- - Keep each Action still in a **Python code block** but with multiple tool calls inside, for example:
427
- ```python
428
- click(0.47, 0.30) # Email field (normalized coordinates)
429
- wait(0.1)
430
- write("[email protected]")
431
- click(0.47, 0.35) # Password field (normalized coordinates)
432
- wait(0.1)
433
- write("mypassword123")
434
- click(0.45, 0.55) # Checkbox "Accept terms" (normalized coordinates)
435
- wait(0.1)
436
- ```<end_code>
437
- - Only group actions when:
438
- 1. They're all part of the **same form or step**,
439
- 2. The screenshot clearly shows all elements and coordinates,
440
- 3. The order of operations is obvious.
441
- - Otherwise, default back to one Action per step.
442
-
443
- ### Precision
444
- - Always **click before typing** to ensure the right field is active.
445
- - Always **scroll if needed** to bring elements into view before clicking.
446
- - Always **validate each action** via the screenshot before continuing.
447
- - Always use **normalized coordinates between 0.0 and 1.0**.
448
- </web_form_guidelines>
449
-
450
- <task_resolution_example>
451
- For a task like "Sign up for an account and submit the form":
452
-
453
- Step 1:
454
- Short term goal: I want to open the signup page.
455
- What I see: The browser is open on the homepage.
456
- Reflection: I will open the signup URL directly.
457
- Action:
458
- ```python
459
- open("https://example.com/signup")
460
- wait(3)
461
- ```<end_code>
462
-
463
- Step 2:
464
- Short term goal: I want to fill the "Email" field.
465
- What I see: I see the signup form with an "Email" field roughly in the center-left of the screen.
466
- Reflection: I will click inside the field (approximately 0.47, 0.30 in normalized coordinates) then type my email.
467
- Action:
468
- ```python
469
- click(0.47, 0.30)
470
- write("[email protected]")
471
- ```<end_code>
472
-
473
- Step 3:
474
- Short term goal: I want to check the "I accept terms" checkbox.
475
- What I see: The checkbox is in the lower portion of the form, around 0.45, 0.55 in normalized coordinates.
476
- Reflection: I will click it.
477
- Action:
478
- ```python
479
- click(0.45, 0.55)
480
- ```<end_code>
481
-
482
- Step 4:
483
- Short term goal: I want to submit the form.
484
- What I see: The "Sign Up" button is at the bottom center, around 0.52, 0.65 in normalized coordinates.
485
- Reflection: I will click the button to submit.
486
- Action:
487
- ```python
488
- click(0.52, 0.65)
489
- wait(3)
490
- ```<end_code>
491
-
492
- Step 5:
493
- Short term goal: Verify signup completed.
494
- What I see: A confirmation page "Welcome [email protected]".
495
- Reflection: Task complete.
496
- Action:
497
- ```python
498
- final_answer("Signup completed")
499
- ```<end_code>
500
- </task_resolution_example>
501
-
502
- <general_guidelines>
503
- # GUI Agent Guidelines for Web Forms (Normalized Coordinates)
504
-
505
- ## Environment Overview
506
- Ubuntu 22.04 XFCE4 desktop with Google Chrome/Chromium browser.
507
- Agent can fill forms, sign up, sign in, click checkboxes, submit claims.
508
- **All coordinates are normalized between 0.0 and 1.0.**
509
-
510
- ## Core Principles
511
-
512
- ### 1. Screenshot Analysis
513
- - Always analyze the latest screenshot carefully before each action.
514
- - Validate that previous actions worked by examining the current state.
515
- - If an action didn't work, try an alternative rather than repeating blindly.
516
-
517
- ### 2. Action Execution
518
- - Execute one action at a time.
519
- - Wait for appropriate loading times using `wait()` but not indefinitely.
520
- - Scroll to bring hidden elements into view.
521
-
522
- ### 3. Coordinate System
523
- - **CRITICAL**: Always use normalized coordinates (0.0 to 1.0)
524
- - Convert visual position on screen to normalized coordinates
525
- - Center of screen = (0.5, 0.5)
526
- - Top-left = (0.0, 0.0), Bottom-right = (1.0, 1.0)
527
-
528
- ### 4. Keyboard Shortcuts
529
- - Use `tab` to move between fields, `space` to toggle checkboxes, `enter` to submit forms.
530
- - Copy/paste: `ctrl+C`, `ctrl+V`.
531
- - Refresh page: `refresh()`.
532
-
533
- ### 5. Error Recovery
534
- - If clicking doesn't work, try double_click or right_click.
535
- - If typing doesn't appear, ensure the field is focused with click.
536
- - If popups block the screen, try `press("enter")` or `press("escape")`.
537
-
538
- ### 6. Security & Privacy
539
- - Don't attempt to bypass captchas or 2FA automatically.
540
- - Don't store credentials in plain text unless instructed.
541
-
542
- ### 7. Final Answer
543
- - When the form is successfully submitted or the goal achieved, use:
544
- ```python
545
- final_answer("Done")
546
- ```<end_code>
547
- </general_guidelines>
548
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cua2-core/src/cua2-core/services/models/anthropic.py DELETED
@@ -1,10 +0,0 @@
1
- from smolagents import LiteLLMModel
2
-
3
-
4
- class AnthropicModel(LiteLLMModel):
5
- """Anthropic model"""
6
-
7
- MODEL_TYPE = "anthropic"
8
-
9
- def __init__(self, model_id: str):
10
- super().__init__(model_id=model_id)
 
 
 
 
 
 
 
 
 
 
 
cua2-core/src/cua2-core/services/models/gemini.py DELETED
File without changes
cua2-core/src/cua2-core/services/models/get_model.py DELETED
@@ -1,12 +0,0 @@
1
- from smolagents import Model
2
-
3
- from backend.models.models import AgentType
4
- from backend.services.models.anthropic import AnthropicModel
5
-
6
-
7
- def get_model(model_id: str) -> tuple[Model, AgentType]:
8
- """Get the model"""
9
- if "sonnet" in model_id:
10
- return AnthropicModel(model_id=model_id), AgentType.PIXEL_COORDINATES
11
- else:
12
- raise ValueError(f"Model {model_id} not found")
 
 
 
 
 
 
 
 
 
 
 
 
 
cua2-core/src/cua2-core/services/models/qwen.py DELETED
File without changes
cua2-core/src/{cua2-core → cua2_core}/__init__.py RENAMED
File without changes
cua2-core/src/{cua2-core → cua2_core}/app.py RENAMED
@@ -4,8 +4,8 @@ from dotenv import load_dotenv
4
  from fastapi import FastAPI
5
  from fastapi.middleware.cors import CORSMiddleware
6
 
7
- from backend.services.agent_service import AgentService
8
- from backend.websocket.websocket_manager import WebSocketManager
9
 
10
  # Load environment variables
11
  load_dotenv()
 
4
  from fastapi import FastAPI
5
  from fastapi.middleware.cors import CORSMiddleware
6
 
7
+ from cua2_core.services.agent_service import AgentService
8
+ from cua2_core.websocket.websocket_manager import WebSocketManager
9
 
10
  # Load environment variables
11
  load_dotenv()
cua2-core/src/{cua2-core → cua2_core}/main.py RENAMED
@@ -1,10 +1,9 @@
1
  import os
2
 
3
  import uvicorn
4
-
5
- from backend.app import app
6
- from backend.routes.routes import router
7
- from backend.routes.websocket import router as websocket_router
8
 
9
  # Include routes
10
  app.include_router(router, prefix="/api/v1")
@@ -14,7 +13,7 @@ app.include_router(websocket_router)
14
  # Health check endpoint (without prefix)
15
  @app.get("/health")
16
  async def health():
17
- return {"status": "healthy", "service": "computer-use-studio-backend"}
18
 
19
 
20
  if __name__ == "__main__":
@@ -29,9 +28,10 @@ if __name__ == "__main__":
29
  print(f"WebSocket endpoint: ws://{host}:{port}/ws")
30
 
31
  uvicorn.run(
32
- "backend.app:app",
33
  host=host,
34
  port=port,
35
- reload=debug,
 
36
  log_level="info" if not debug else "debug",
37
  )
 
1
  import os
2
 
3
  import uvicorn
4
+ from cua2_core.app import app
5
+ from cua2_core.routes.routes import router
6
+ from cua2_core.routes.websocket import router as websocket_router
 
7
 
8
  # Include routes
9
  app.include_router(router, prefix="/api/v1")
 
13
  # Health check endpoint (without prefix)
14
  @app.get("/health")
15
  async def health():
16
+ return {"status": "healthy", "service": "cua2-core"}
17
 
18
 
19
  if __name__ == "__main__":
 
28
  print(f"WebSocket endpoint: ws://{host}:{port}/ws")
29
 
30
  uvicorn.run(
31
+ "cua2_core.app:app",
32
  host=host,
33
  port=port,
34
+ # reload=debug,
35
+ reload=True,
36
  log_level="info" if not debug else "debug",
37
  )
cua2-core/src/cua2_core/models/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ """Models module for CUA2 Core"""
2
+
cua2-core/src/cua2_core/models/models.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from datetime import datetime
4
+ from enum import Enum
5
+ from typing import Annotated, Literal, TypeAlias
6
+
7
+ from pydantic import BaseModel, Field, field_serializer, model_validator
8
+
9
+ #################### Backend -> Frontend ########################
10
+
11
+ class AgentAction(BaseModel):
12
+ """Agent action structure"""
13
+
14
+ actionType: Literal["click", "write", "press", "scroll", "wait", "open", "launch_app", "refresh", "go_back"]
15
+ actionArguments: dict
16
+
17
+ def to_string(self) -> str:
18
+ """Convert action to a human-readable string"""
19
+ action_type = self.actionType
20
+ args = self.actionArguments
21
+
22
+ if action_type == "click":
23
+ x = args.get("x", "?")
24
+ y = args.get("y", "?")
25
+ return f"Click at coordinates ({x}, {y})"
26
+
27
+ elif action_type == "write":
28
+ text = args.get("text", "")
29
+ return f"Type text: '{text}'"
30
+
31
+ elif action_type == "press":
32
+ key = args.get("key", "")
33
+ return f"Press key: {key}"
34
+
35
+ elif action_type == "scroll":
36
+ direction = args.get("direction", "down")
37
+ amount = args.get("amount", 2)
38
+ return f"Scroll {direction} by {amount}"
39
+
40
+ elif action_type == "wait":
41
+ seconds = args.get("seconds", 0)
42
+ return f"Wait for {seconds} seconds"
43
+
44
+ elif action_type == "open":
45
+ file_or_url = args.get("file_or_url", "")
46
+ return f"Open: {file_or_url}"
47
+
48
+ elif action_type == "launch_app":
49
+ app_name = args.get("app_name", "")
50
+ return f"Launch app: {app_name}"
51
+
52
+ elif action_type == "refresh":
53
+ return "Refresh the current page"
54
+
55
+ elif action_type == "go_back":
56
+ return "Go back one page"
57
+
58
+
59
+ class AgentStep(BaseModel):
60
+ """Agent step structure"""
61
+
62
+ traceId: str
63
+ stepId: str
64
+ image: str
65
+ thought: str
66
+ actions: list[AgentAction]
67
+ timeTaken: float
68
+ inputTokensUsed: int
69
+ outputTokensUsed: int
70
+ timestamp: datetime
71
+ step_evaluation: Literal['like', 'dislike', 'neutral']
72
+
73
+ @field_serializer('actions')
74
+ def serialize_actions(self, actions: list[AgentAction], _info):
75
+ """Convert actions to list of strings when dumping (controlled by context)"""
76
+
77
+ if _info.context and _info.context.get('actions_as_json', False):
78
+ return [action.model_dump(mode="json") for action in actions]
79
+
80
+ return [action.to_string() for action in actions]
81
+
82
+
83
+ class AgentTraceMetadata(BaseModel):
84
+ """Metadata for agent execution"""
85
+
86
+ traceId: str = ""
87
+ inputTokensUsed: int = 0
88
+ outputTokensUsed: int = 0
89
+ timeTaken: float = 0.0 # in seconds
90
+ numberOfSteps: int = 0
91
+
92
+
93
+ class AgentTrace(BaseModel):
94
+ """Agent message structure"""
95
+
96
+ id: str
97
+ timestamp: datetime
98
+ instruction: str
99
+ modelId: str
100
+ isRunning: bool
101
+ steps: list[AgentStep] = []
102
+ traceMetadata: AgentTraceMetadata = AgentTraceMetadata()
103
+
104
+ @model_validator(mode="after")
105
+ def validate_trace(self):
106
+ """Validate trace"""
107
+ if not self.steps:
108
+ self.steps = []
109
+ if not self.traceMetadata:
110
+ self.traceMetadata = AgentTraceMetadata()
111
+ return self
112
+
113
+
114
+ #################### WebSocket Events ########################
115
+
116
+
117
+ class AgentStartEvent(BaseModel):
118
+ """Agent start event"""
119
+
120
+ type: Literal["agent_start"] = "agent_start"
121
+ agentTrace: AgentTrace
122
+
123
+
124
+ class AgentProgressEvent(BaseModel):
125
+ """Agent progress event"""
126
+
127
+ type: Literal["agent_progress"] = "agent_progress"
128
+ agentStep: AgentStep
129
+ traceMetadata: AgentTraceMetadata
130
+
131
+
132
+ class AgentCompleteEvent(BaseModel):
133
+ """Agent complete event"""
134
+
135
+ type: Literal["agent_complete"] = "agent_complete"
136
+ traceMetadata: AgentTraceMetadata
137
+
138
+
139
+ class AgentErrorEvent(BaseModel):
140
+ """Agent error event"""
141
+
142
+ type: Literal["agent_error"] = "agent_error"
143
+ error: str
144
+
145
+
146
+ class VncUrlSetEvent(BaseModel):
147
+ """Vnc url set event"""
148
+
149
+ type: Literal["vnc_url_set"] = "vnc_url_set"
150
+ vncUrl: str
151
+
152
+
153
+ class VncUrlUnsetEvent(BaseModel):
154
+ """Vnc url unset event"""
155
+
156
+ type: Literal["vnc_url_unset"] = "vnc_url_unset"
157
+
158
+
159
+ class HeartbeatEvent(BaseModel):
160
+ """Heartbeat event"""
161
+
162
+ type: Literal["heartbeat"] = "heartbeat"
163
+
164
+
165
+ WebSocketEvent: TypeAlias = Annotated[
166
+ AgentStartEvent
167
+ | AgentProgressEvent
168
+ | AgentCompleteEvent
169
+ | AgentErrorEvent
170
+ | VncUrlSetEvent
171
+ | VncUrlUnsetEvent
172
+ | HeartbeatEvent,
173
+ Field(discriminator="type"),
174
+ ]
175
+
176
+
177
+ #################### Frontend -> Backend ########################
178
+
179
+
180
+ class UserTaskMessage(BaseModel):
181
+ """Message sent from frontend to backend"""
182
+
183
+ event_type: Literal["user_task"]
184
+ agent_trace: AgentTrace | None = None
185
+
186
+
187
+ ##################### Agent Service ########################
188
+
189
+
190
+ class ActiveTask(BaseModel):
191
+ """Active task"""
192
+
193
+ message_id: str
194
+ instruction: str
195
+ modelId: str
196
+ timestamp: datetime = datetime.now()
197
+ steps: list[AgentStep] = []
198
+ traceMetadata: AgentTraceMetadata = AgentTraceMetadata()
199
+
200
+ @property
201
+ def trace_path(self):
202
+ """Trace path"""
203
+ return f"data/trace-{self.message_id}-{self.modelId}"
204
+
205
+ @model_validator(mode="after")
206
+ def store_model(self):
207
+ """Validate model ID"""
208
+ self.traceMetadata.traceId = self.message_id
209
+ os.makedirs(self.trace_path, exist_ok=True)
210
+ with open(f"{self.trace_path}/tasks.json", "w") as f:
211
+ json.dump(self.model_dump(mode="json", context={"actions_as_json": True}), f, indent=2)
212
+
213
+ return self
214
+
215
+
216
+ class HealthResponse(BaseModel):
217
+ """Health check response"""
218
+
219
+ status: str
220
+ timestamp: datetime
221
+ websocket_connections: int
cua2-core/src/cua2_core/routes/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ """Routes module for CUA2 Core"""
2
+
cua2-core/src/{cua2-core → cua2_core}/routes/routes.py RENAMED
@@ -3,9 +3,9 @@ from datetime import datetime
3
  from fastapi import APIRouter, Depends, HTTPException, Request
4
 
5
  # Get services from app state
6
- from backend.models.models import HealthResponse
7
- from backend.services.agent_service import AgentService
8
- from backend.websocket.websocket_manager import WebSocketManager
9
 
10
  # Create router
11
  router = APIRouter()
 
3
  from fastapi import APIRouter, Depends, HTTPException, Request
4
 
5
  # Get services from app state
6
+ from cua2_core.models.models import HealthResponse
7
+ from cua2_core.services.agent_service import AgentService
8
+ from cua2_core.websocket.websocket_manager import WebSocketManager
9
 
10
  # Create router
11
  router = APIRouter()
cua2-core/src/{cua2-core → cua2_core}/routes/websocket.py RENAMED
@@ -3,8 +3,8 @@ import json
3
  from fastapi import APIRouter, WebSocket, WebSocketDisconnect
4
 
5
  # Get services from app state
6
- from backend.app import app
7
- from backend.models.models import UserTaskMessage, WebSocketEvent
8
 
9
  # Create router
10
  router = APIRouter()
@@ -20,11 +20,8 @@ async def websocket_endpoint(websocket: WebSocket):
20
  await websocket_manager.connect(websocket)
21
 
22
  try:
23
- welcome_message = WebSocketEvent(
24
- type="heartbeat",
25
- content="WebSocket connection established successfully",
26
- messageId="connection_welcome",
27
- )
28
  await websocket_manager.send_personal_message(welcome_message, websocket)
29
 
30
  # Keep the connection alive and wait for messages
@@ -36,27 +33,32 @@ async def websocket_endpoint(websocket: WebSocket):
36
  try:
37
  # Parse the message
38
  message_data = json.loads(data)
39
- message = UserTaskMessage(**message_data)
40
-
41
- # Process the user task
42
- if message.type == "user_task":
43
- message_id = await agent_service.process_user_task(
44
- message.content, message.model_id
45
- )
46
-
47
- # Send acknowledgment back to the client
48
- response = WebSocketEvent(
49
- type="agent_start",
50
- content=f"Received task: {message.content}",
51
- messageId=message_id,
52
- )
53
- await websocket_manager.send_personal_message(
54
- response, websocket
55
- )
56
-
57
- except json.JSONDecodeError:
58
- error_response = WebSocketEvent(
59
- type="agent_error", content="Invalid JSON format"
 
 
 
 
 
60
  )
61
  await websocket_manager.send_personal_message(
62
  error_response, websocket
@@ -64,9 +66,12 @@ async def websocket_endpoint(websocket: WebSocket):
64
 
65
  except Exception as e:
66
  print(f"Error processing message: {e}")
67
- error_response = WebSocketEvent(
 
 
 
68
  type="agent_error",
69
- content=f"Error processing message: {str(e)}",
70
  )
71
  await websocket_manager.send_personal_message(
72
  error_response, websocket
 
3
  from fastapi import APIRouter, WebSocket, WebSocketDisconnect
4
 
5
  # Get services from app state
6
+ from cua2_core.app import app
7
+ from cua2_core.models.models import UserTaskMessage, AgentTrace, HeartbeatEvent
8
 
9
  # Create router
10
  router = APIRouter()
 
20
  await websocket_manager.connect(websocket)
21
 
22
  try:
23
+ # Send welcome heartbeat
24
+ welcome_message = HeartbeatEvent(type="heartbeat")
 
 
 
25
  await websocket_manager.send_personal_message(welcome_message, websocket)
26
 
27
  # Keep the connection alive and wait for messages
 
33
  try:
34
  # Parse the message
35
  message_data = json.loads(data)
36
+ print(f"Received message: {message_data}")
37
+
38
+ # Check if it's a user task message
39
+ if message_data.get("type") == "user_task":
40
+ # Extract and parse the trace
41
+ trace_data = message_data.get("trace")
42
+ if trace_data:
43
+ # Convert timestamp string to datetime if needed
44
+ if isinstance(trace_data.get("timestamp"), str):
45
+ from datetime import datetime
46
+ trace_data["timestamp"] = datetime.fromisoformat(trace_data["timestamp"].replace("Z", "+00:00"))
47
+
48
+ trace = AgentTrace(**trace_data)
49
+
50
+ # Process the user task with the trace
51
+ trace_id = await agent_service.process_user_task(trace)
52
+ print(f"Started processing trace: {trace_id}")
53
+ else:
54
+ print("No trace data in message")
55
+
56
+ except json.JSONDecodeError as e:
57
+ print(f"JSON decode error: {e}")
58
+ from cua2_core.models.models import AgentErrorEvent
59
+ error_response = AgentErrorEvent(
60
+ type="agent_error",
61
+ error="Invalid JSON format"
62
  )
63
  await websocket_manager.send_personal_message(
64
  error_response, websocket
 
66
 
67
  except Exception as e:
68
  print(f"Error processing message: {e}")
69
+ import traceback
70
+ traceback.print_exc()
71
+ from cua2_core.models.models import AgentErrorEvent
72
+ error_response = AgentErrorEvent(
73
  type="agent_error",
74
+ error=f"Error processing message: {str(e)}"
75
  )
76
  await websocket_manager.send_personal_message(
77
  error_response, websocket
cua2-core/src/cua2_core/services/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ """Services module for CUA2 Core"""
2
+
cua2-core/src/cua2_core/services/agent_service.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import base64
4
+ from datetime import datetime
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ from cua2_core.models.models import (
9
+ ActiveTask,
10
+ AgentTrace,
11
+ AgentStep,
12
+ AgentAction,
13
+ AgentTraceMetadata,
14
+ AgentStartEvent,
15
+ AgentProgressEvent,
16
+ AgentCompleteEvent,
17
+ AgentErrorEvent,
18
+ VncUrlSetEvent,
19
+ VncUrlUnsetEvent,
20
+ )
21
+ from cua2_core.websocket.websocket_manager import WebSocketManager
22
+
23
+
24
+ class AgentService:
25
+ """Service for handling agent tasks and processing"""
26
+
27
+ def __init__(self, websocket_manager):
28
+ self.active_tasks: dict[str, ActiveTask] = {}
29
+ self.websocket_manager: WebSocketManager = websocket_manager
30
+ self.simulation_data_path = Path(__file__).parent / "simulation_metadata" / "simulated_trace.json"
31
+ self.simulation_images_path = Path(__file__).parent / "simulation_metadata" / "images"
32
+
33
+ async def process_user_task(self, trace: AgentTrace) -> str:
34
+ """Process a user task and return the trace ID"""
35
+
36
+ trace_id = trace.id
37
+ trace.steps = []
38
+ trace.traceMetadata = AgentTraceMetadata(traceId=trace_id)
39
+
40
+ # Store the task
41
+ self.active_tasks[trace_id] = ActiveTask(
42
+ message_id=trace_id,
43
+ instruction=trace.instruction,
44
+ modelId=trace.modelId,
45
+ timestamp=trace.timestamp,
46
+ steps=trace.steps,
47
+ traceMetadata=trace.traceMetadata,
48
+ )
49
+
50
+ # Start the agent processing in the background
51
+ asyncio.create_task(
52
+ self._simulate_agent_processing(trace)
53
+ )
54
+
55
+ return trace_id
56
+
57
+
58
+ async def _simulate_agent_processing(self, trace: AgentTrace):
59
+ """Simulate agent processing using simulated_trace.json data"""
60
+ trace_id = trace.id
61
+
62
+ try:
63
+ # Load simulation data
64
+ with open(self.simulation_data_path, 'r') as f:
65
+ simulation_data = json.load(f)
66
+
67
+ # Send agent start event with the initial trace
68
+ start_event = AgentStartEvent(
69
+ type="agent_start",
70
+ agentTrace=trace
71
+ )
72
+ await self.websocket_manager.broadcast(start_event)
73
+
74
+ # mock VNC URL
75
+ vnc_url = "https://www.youtube.com/embed/VCutEsRSJ5A?si=PT0ETJ7zIJ9ywhGW"
76
+ vnc_set_event = VncUrlSetEvent(
77
+ type="vnc_url_set",
78
+ vncUrl=vnc_url
79
+ )
80
+ await self.websocket_manager.broadcast(vnc_set_event)
81
+
82
+ trace_metadata = AgentTraceMetadata(traceId=trace_id)
83
+
84
+ # Process each step from the simulation data
85
+ for step_data in simulation_data["steps"]:
86
+ # Wait before sending the next step to simulate processing time
87
+ await asyncio.sleep(step_data["timeTaken"])
88
+
89
+ # Load and encode the image
90
+ image_path = self.simulation_images_path / step_data["image"].split("/")[-1]
91
+ with open(image_path, 'rb') as img_file:
92
+ image_bytes = img_file.read()
93
+ image_base64 = f"data:image/png;base64,{base64.b64encode(image_bytes).decode('utf-8')}"
94
+
95
+ # Convert actions to AgentAction objects
96
+ actions = [
97
+ AgentAction(
98
+ actionType=action["actionType"],
99
+ actionArguments=action["actionArguments"]
100
+ )
101
+ for action in step_data["actions"]
102
+ ]
103
+
104
+ # Create agent step
105
+ agent_step = AgentStep(
106
+ traceId=trace_id,
107
+ stepId=step_data["stepId"],
108
+ image=image_base64,
109
+ thought=step_data["thought"],
110
+ actions=actions,
111
+ timeTaken=step_data["timeTaken"],
112
+ inputTokensUsed=step_data["inputTokensUsed"],
113
+ outputTokensUsed=step_data["outputTokensUsed"],
114
+ timestamp=datetime.fromisoformat(step_data["timestamp"].replace("Z", "+00:00")),
115
+ step_evaluation=step_data["step_evaluation"]
116
+ )
117
+
118
+ trace_metadata.numberOfSteps += 1
119
+ trace_metadata.timeTaken += step_data["timeTaken"]
120
+ trace_metadata.inputTokensUsed += step_data["inputTokensUsed"]
121
+ trace_metadata.outputTokensUsed += step_data["outputTokensUsed"]
122
+
123
+ # Send progress event
124
+ progress_event = AgentProgressEvent(
125
+ type="agent_progress",
126
+ agentStep=agent_step,
127
+ traceMetadata=trace_metadata
128
+ )
129
+ await self.websocket_manager.broadcast(progress_event)
130
+
131
+ # Update active task
132
+ self.active_tasks[trace_id].steps.append(agent_step)
133
+
134
+ # Unset VNC URL before completion
135
+ vnc_unset_event = VncUrlUnsetEvent(type="vnc_url_unset")
136
+ await self.websocket_manager.broadcast(vnc_unset_event)
137
+
138
+ # Send completion event
139
+ complete_event = AgentCompleteEvent(
140
+ type="agent_complete",
141
+ traceMetadata=trace_metadata
142
+ )
143
+ await self.websocket_manager.broadcast(complete_event)
144
+
145
+ # Update active task with final metadata
146
+ self.active_tasks[trace_id].traceMetadata = trace_metadata
147
+
148
+ # Clean up after a delay
149
+ await asyncio.sleep(1)
150
+ if trace_id in self.active_tasks:
151
+ del self.active_tasks[trace_id]
152
+
153
+ except Exception as e:
154
+ print(f"Error in agent simulation: {str(e)}")
155
+ # Send error event
156
+ error_event = AgentErrorEvent(
157
+ type="agent_error",
158
+ error=f"Error processing task: {str(e)}"
159
+ )
160
+ await self.websocket_manager.broadcast(error_event)
161
+
162
+ # Clean up
163
+ if trace_id in self.active_tasks:
164
+ del self.active_tasks[trace_id]
165
+
166
+ def get_active_tasks(self) -> dict:
167
+ """Get currently active tasks"""
168
+ return self.active_tasks.copy()
169
+
170
+ def get_task_status(self, message_id: str) -> Optional[dict]:
171
+ """Get status of a specific task"""
172
+ return self.active_tasks.get(message_id)
cua2-core/src/cua2_core/services/simulation_metadata/simulated_trace.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "steps": [
3
+ {
4
+ "stepId": "step_001",
5
+ "image": "images/step_1.png",
6
+ "thought": "I can see a form with multiple input fields. I need to start by clicking on the first name field to begin filling out the form.",
7
+ "actions": [
8
+ {
9
+ "actionType": "click",
10
+ "actionArguments": {
11
+ "x": 320,
12
+ "y": 180
13
+ }
14
+ }
15
+ ],
16
+ "timeTaken": 2.3,
17
+ "inputTokensUsed": 1250,
18
+ "outputTokensUsed": 85,
19
+ "timestamp": "2025-10-17T14:30:02.300Z",
20
+ "step_evaluation": "like"
21
+ },
22
+ {
23
+ "stepId": "step_002",
24
+ "image": "images/step_2.png",
25
+ "thought": "After clicking the first field, I can see the cursor is active. Now I should proceed to click on the email field to continue with the form submission process.",
26
+ "actions": [
27
+ {
28
+ "actionType": "click",
29
+ "actionArguments": {
30
+ "x": 420,
31
+ "y": 285
32
+ }
33
+ }
34
+ ],
35
+ "timeTaken": 1.8,
36
+ "inputTokensUsed": 1180,
37
+ "outputTokensUsed": 72,
38
+ "timestamp": "2025-10-17T14:30:04.100Z",
39
+ "step_evaluation": "like"
40
+ },
41
+ {
42
+ "stepId": "step_003",
43
+ "image": "images/step_3.png",
44
+ "thought": "The form appears to be mostly filled. I can see a submit button at the bottom of the form. I'll click on it to complete the form submission.",
45
+ "actions": [
46
+ {
47
+ "actionType": "click",
48
+ "actionArguments": {
49
+ "x": 450,
50
+ "y": 520
51
+ }
52
+ }
53
+ ],
54
+ "timeTaken": 1.5,
55
+ "inputTokensUsed": 1100,
56
+ "outputTokensUsed": 68,
57
+ "timestamp": "2025-10-17T14:30:05.600Z",
58
+ "step_evaluation": "like"
59
+ }
60
+ ]
61
+ }
62
+
cua2-core/src/cua2_core/websocket/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ """WebSocket module for CUA2 Core"""
2
+
cua2-core/src/{cua2-core → cua2_core}/websocket/websocket_manager.py RENAMED
@@ -4,7 +4,7 @@ from typing import Dict, Optional, Set
4
 
5
  from fastapi import WebSocket
6
 
7
- from backend.models.models import AgentMetadata, WebSocketEvent
8
 
9
 
10
  class WebSocketManager:
@@ -35,7 +35,7 @@ class WebSocketManager:
35
  ):
36
  """Send a message to a specific WebSocket connection"""
37
  try:
38
- await websocket.send_text(json.dumps(message.model_dump()))
39
  except Exception as e:
40
  print(f"Error sending personal message: {e}")
41
  # Only disconnect if the connection is still in our set
@@ -52,7 +52,7 @@ class WebSocketManager:
52
 
53
  for connection in self.active_connections.copy():
54
  try:
55
- await connection.send_text(json.dumps(message.model_dump()))
56
  except Exception as e:
57
  print(f"Error broadcasting to connection: {e}")
58
  disconnected.append(connection)
@@ -77,7 +77,7 @@ class WebSocketManager:
77
  await self.broadcast(event)
78
 
79
  async def send_agent_complete(
80
- self, content: str, message_id: str, metadata: Optional[AgentMetadata] = None
81
  ):
82
  """Send agent complete event"""
83
  event = WebSocketEvent(
 
4
 
5
  from fastapi import WebSocket
6
 
7
+ from cua2_core.models.models import AgentTraceMetadata, WebSocketEvent
8
 
9
 
10
  class WebSocketManager:
 
35
  ):
36
  """Send a message to a specific WebSocket connection"""
37
  try:
38
+ await websocket.send_text(json.dumps(message.model_dump(mode="json")))
39
  except Exception as e:
40
  print(f"Error sending personal message: {e}")
41
  # Only disconnect if the connection is still in our set
 
52
 
53
  for connection in self.active_connections.copy():
54
  try:
55
+ await connection.send_text(json.dumps(message.model_dump(mode="json")))
56
  except Exception as e:
57
  print(f"Error broadcasting to connection: {e}")
58
  disconnected.append(connection)
 
77
  await self.broadcast(event)
78
 
79
  async def send_agent_complete(
80
+ self, content: str, message_id: str, metadata: Optional[AgentTraceMetadata] = None
81
  ):
82
  """Send agent complete event"""
83
  event = WebSocketEvent(