Amir Mahla committed on
Commit
af1ae43
·
0 Parent(s):
Files changed (36) hide show
  1. .gitignore +226 -0
  2. cua2-core/env.example +11 -0
  3. cua2-core/pyproject.toml +93 -0
  4. cua2-core/src/__init__.py +1 -0
  5. cua2-core/src/cua2-core/__init__.py +1 -0
  6. cua2-core/src/cua2-core/app.py +64 -0
  7. cua2-core/src/cua2-core/main.py +37 -0
  8. cua2-core/src/cua2-core/models/models.py +95 -0
  9. cua2-core/src/cua2-core/routes/routes.py +56 -0
  10. cua2-core/src/cua2-core/routes/websocket.py +86 -0
  11. cua2-core/src/cua2-core/services/agent_service.py +130 -0
  12. cua2-core/src/cua2-core/services/agents/get_agents.py +57 -0
  13. cua2-core/src/cua2-core/services/agents/normalized_1000_agent.py +293 -0
  14. cua2-core/src/cua2-core/services/agents/normalized_agent.py +282 -0
  15. cua2-core/src/cua2-core/services/agents/pixel_coordonates_agent.py +317 -0
  16. cua2-core/src/cua2-core/services/agents/prompt.py +548 -0
  17. cua2-core/src/cua2-core/services/models/anthropic.py +10 -0
  18. cua2-core/src/cua2-core/services/models/gemini.py +0 -0
  19. cua2-core/src/cua2-core/services/models/get_model.py +12 -0
  20. cua2-core/src/cua2-core/services/models/qwen.py +0 -0
  21. cua2-core/src/cua2-core/websocket/websocket_manager.py +117 -0
  22. cua2-front/.gitignore +24 -0
  23. cua2-front/index.html +14 -0
  24. cua2-front/package-lock.json +0 -0
  25. cua2-front/package.json +33 -0
  26. cua2-front/src/App.tsx +15 -0
  27. cua2-front/src/hooks/useWebSocket.ts +154 -0
  28. cua2-front/src/index.css +20 -0
  29. cua2-front/src/main.tsx +5 -0
  30. cua2-front/src/pages/Index.tsx +132 -0
  31. cua2-front/src/types/agent.ts +36 -0
  32. cua2-front/src/vite-env.d.ts +1 -0
  33. cua2-front/tsconfig.app.json +35 -0
  34. cua2-front/tsconfig.json +16 -0
  35. cua2-front/tsconfig.node.json +22 -0
  36. cua2-front/vite.config.ts +17 -0
.gitignore ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # Abstra
171
+ # Abstra is an AI-powered process automation framework.
172
+ # Ignore directories containing user credentials, local state, and settings.
173
+ # Learn more at https://abstra.io/docs
174
+ .abstra/
175
+
176
+ # Visual Studio Code
177
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
178
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
179
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
180
+ # you could uncomment the following to ignore the entire vscode folder
181
+ # .vscode/
182
+
183
+ # Ruff stuff:
184
+ .ruff_cache/
185
+
186
+ # PyPI configuration file
187
+ .pypirc
188
+
189
+ # Cursor
190
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
191
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
192
+ # refer to https://docs.cursor.com/context/ignore-files
193
+ .cursorignore
194
+ .cursorindexingignore
195
+
196
+ gui_agent_demo.mp4
197
+
198
+ recording.mp4
199
+
200
+ uv.lock
201
+ .DS_Store
202
+
203
+ # Logs
204
+ logs
205
+ *.log
206
+ npm-debug.log*
207
+ yarn-debug.log*
208
+ yarn-error.log*
209
+ pnpm-debug.log*
210
+ lerna-debug.log*
211
+
212
+ node_modules
213
+ dist
214
+ dist-ssr
215
+ *.local
216
+
217
+ # Editor directories and files
218
+ .vscode/*
219
+ !.vscode/extensions.json
220
+ .idea
221
+ .DS_Store
222
+ *.suo
223
+ *.ntvs*
224
+ *.njsproj
225
+ *.sln
226
+ *.sw?
cua2-core/env.example ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment Configuration
2
+ HOST=0.0.0.0
3
+ PORT=8000
4
+ DEBUG=true
5
+
6
+ # Agent Configuration
7
+ AGENT_TIMEOUT=300
8
+ MAX_CONCURRENT_TASKS=5
9
+
10
+ # Logging
11
+ LOG_LEVEL=INFO
cua2-core/pyproject.toml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "cua2-core"
7
+ version = "0.0.0-dev.0"
8
+ description = "Backend API server for Computer Use Agent"
9
+ readme = "README.md"
10
+ authors = [{ name = "Amir Mahla", email = "[email protected]" }]
11
+ keywords = ["fastapi", "api", "backend", "automation"]
12
+ classifiers = [
13
+ "Development Status :: 4 - Beta",
14
+ "Intended Audience :: Developers",
15
+ "License :: OSI Approved :: MIT License",
16
+ "Operating System :: OS Independent",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.10",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Topic :: Internet :: WWW/HTTP :: HTTP Servers",
22
+ "Topic :: Software Development :: Libraries :: Application Frameworks",
23
+ ]
24
+ requires-python = ">=3.10"
25
+ dependencies = [
26
+ "fastapi>=0.115.13",
27
+ "uvicorn[standard]>=0.29.0,<0.30.0",
28
+ "websockets>=13.1.0,<14.0.0",
29
+ "pydantic>=2.11.7",
30
+ "python-multipart>=0.0.18,<0.0.19",
31
+ "python-jose[cryptography]==3.3.0",
32
+ "passlib[bcrypt]==1.7.4",
33
+ "python-dotenv==1.0.0",
34
+ "httpx>=0.27.1",
35
+ "asyncio-mqtt==0.16.1",
36
+ "aiofiles==23.2.1",
37
+ "smolagents[openai,litellm]==1.15.0",
38
+ "openai==1.91.0",
39
+ "litellm[proxy]==1.63.14",
40
+ ]
41
+
42
+ [project.optional-dependencies]
43
+ dev = [
44
+ "pytest>=7.0.0",
45
+ "pytest-asyncio>=0.21.0",
46
+ "pytest-cov>=4.0.0",
47
+ "black>=23.0.0",
48
+ "isort>=5.12.0",
49
+ "flake8>=6.0.0",
50
+ "mypy>=1.0.0",
51
+ "pre-commit>=3.0.0",
52
+ ]
53
+ test = [
54
+ "pytest>=7.0.0",
55
+ "pytest-asyncio>=0.21.0",
56
+ "pytest-cov>=4.0.0",
57
+ ]
58
+
59
+ [project.urls]
60
+ Homepage = "https://github.com/huggingface/CUA2"
61
+ Repository = "https://github.com/huggingface/CUA2"
62
+
63
+ [tool.hatch.build.targets.wheel]
64
+ packages = ["src/cua2-core"]
65
+
66
+ [tool.hatch.build.targets.sdist]
67
+ include = [
68
+ "/src",
69
+ "/README.md",
70
+ ]
71
+
72
+ [tool.coverage.run]
73
+ source = ["src"]
74
+ omit = [
75
+ "*/tests/*",
76
+ "*/test_*",
77
+ "*/__pycache__/*",
78
+ "*/migrations/*",
79
+ ]
80
+
81
+ [tool.coverage.report]
82
+ exclude_lines = [
83
+ "pragma: no cover",
84
+ "def __repr__",
85
+ "if self.debug:",
86
+ "if settings.DEBUG",
87
+ "raise AssertionError",
88
+ "raise NotImplementedError",
89
+ "if 0:",
90
+ "if __name__ == .__main__.:",
91
+ "class .*\\bProtocol\\):",
92
+ "@(abc\\.)?abstractmethod",
93
+ ]
cua2-core/src/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Backend package
cua2-core/src/cua2-core/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Backend package
cua2-core/src/cua2-core/app.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from contextlib import asynccontextmanager
2
+
3
+ from dotenv import load_dotenv
4
+ from fastapi import FastAPI
5
+ from fastapi.middleware.cors import CORSMiddleware
6
+
7
+ from backend.services.agent_service import AgentService
8
+ from backend.websocket.websocket_manager import WebSocketManager
9
+
10
+ # Load environment variables
11
+ load_dotenv()
12
+
13
+
14
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the shared services at startup and report shutdown.

    Before yielding, a WebSocketManager and an AgentService (which receives
    that manager) are built and attached to ``app.state`` as
    ``websocket_manager`` and ``agent_service``. After the yield only
    progress messages are printed; no resources are released.

    Args:
        app: The FastAPI application whose ``state`` receives the services.
    """
    print("Initializing services...")

    # Build both services first, then publish them on the app state together.
    manager = WebSocketManager()
    service = AgentService(manager)
    state = app.state
    state.websocket_manager = manager
    state.agent_service = service

    print("Services initialized successfully")

    yield

    # Nothing to tear down yet; cleanup logic would go between these prints.
    print("Shutting down services...")
    print("Services shut down successfully")
38
+
39
+
40
+ # Create FastAPI app with lifespan
41
app = FastAPI(
    lifespan=lifespan,
    title="Computer Use Studio Backend",
    description="Backend API for Computer Use Studio - AI-powered automation interface",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
)

# Configure CORS: allow the local frontend dev servers on both loopback
# spellings. Ports: 3000 (React dev server), 5173 (Vite dev server),
# 8080 (alternative frontend port).
app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        f"http://{host}:{port}"
        for host in ("localhost", "127.0.0.1")
        for port in (3000, 5173, 8080)
    ],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
cua2-core/src/cua2-core/main.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import uvicorn
4
+
5
+ from backend.app import app
6
+ from backend.routes.routes import router
7
+ from backend.routes.websocket import router as websocket_router
8
+
9
+ # Include routes
10
+ app.include_router(router, prefix="/api/v1")
11
+ app.include_router(websocket_router)
12
+
13
+
14
+ # Health check endpoint (without prefix)
15
@app.get("/health")
async def health():
    """Return a static liveness payload for the unprefixed /health route."""
    payload = {"status": "healthy", "service": "computer-use-studio-backend"}
    return payload
18
+
19
+
20
if __name__ == "__main__":
    # Runtime settings come from the environment, with fallbacks when unset.
    host = os.getenv("HOST", "0.0.0.0")
    port = int(os.getenv("PORT", 8000))
    debug = os.getenv("DEBUG", "false").lower() == "true"

    # Startup banner: one line per item, printed in order.
    banner = (
        f"Starting Computer Use Studio Backend on {host}:{port}",
        f"Debug mode: {debug}",
        f"API Documentation: http://{host}:{port}/docs",
        f"WebSocket endpoint: ws://{host}:{port}/ws",
    )
    for line in banner:
        print(line)

    # Debug mode turns on auto-reload and switches the log level to "debug".
    uvicorn.run(
        "backend.app:app",
        host=host,
        port=port,
        reload=debug,
        log_level="debug" if debug else "info",
    )
cua2-core/src/cua2-core/models/models.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from datetime import datetime
4
+ from enum import Enum
5
+ from typing import Literal, Optional
6
+
7
+ from pydantic import BaseModel, model_validator
8
+
9
+
10
class AgentMetadata(BaseModel):
    """Metadata for agent execution.

    All four fields are required (none has a default).
    """

    # Token counters; presumably totals for one agent run - confirm with the producer.
    inputTokensUsed: int
    outputTokensUsed: int
    timeTaken: float  # in seconds
    numberOfSteps: int
17
+
18
+
19
class AgentType(str, Enum):
    """Agent type.

    Members are also ``str`` instances, so each compares equal to its
    string value.
    """

    PIXEL_COORDINATES = "pixel_coordinates"
    NORMALIZED_1000_COORDINATES = "normalized_1000_coordinates"
    NORMALIZED_COORDINATES = "normalized_coordinates"
25
+
26
+
27
class ActiveTask(BaseModel):
    """A task that has been accepted and is being tracked.

    Constructing an instance has a side effect: the after-validator below
    creates the task's trace directory and writes the task as JSON into it.
    """

    message_id: str
    content: str
    model_id: str
    start_time: datetime
    status: str

    @property
    def trace_path(self):
        """Relative directory for this task's trace, derived from both ids."""
        return "data/trace-{}-{}".format(self.message_id, self.model_id)

    @model_validator(mode="after")
    def validate_model_id(self):
        """Persist the validated task to ``<trace_path>/user_tasks.json``.

        Despite its name, this hook performs no checks on ``model_id``; it
        only ensures the trace directory exists and dumps the model there.

        Returns:
            The instance itself, as required of an "after" validator.
        """
        directory = self.trace_path
        os.makedirs(directory, exist_ok=True)
        with open(f"{directory}/user_tasks.json", "w") as handle:
            json.dump(self.model_dump(mode="json"), handle, indent=2)

        return self
49
+
50
+
51
class WebSocketEvent(BaseModel):
    """WebSocket event structure.

    ``type`` is required and restricted to the literals listed below; every
    other field is optional and defaults to ``None``.
    """

    # Closed set of event kinds accepted by validation.
    type: Literal[
        "agent_start",
        "agent_progress",
        "agent_complete",
        "agent_error",
        "vnc_url_set",
        "vnc_url_unset",
        "heartbeat",
    ]
    content: Optional[str] = None
    metadata: Optional[AgentMetadata] = None
    messageId: Optional[str] = None
    vncUrl: Optional[str] = None
67
+
68
+
69
class UserTaskMessage(BaseModel):
    """Message sent from frontend to backend.

    All fields are required; ``type`` only validates as ``"user_task"``.
    """

    type: Literal["user_task"]
    content: str
    model_id: str
    # Kept as a plain string; no datetime parsing is applied by this model.
    timestamp: str
76
+
77
+
78
class AgentMessage(BaseModel):
    """Agent message structure.

    ``id``, ``type``, ``content`` and ``timestamp`` are required; the
    remaining fields are optional and default to ``None``.
    """

    id: str
    # Sender of the message: either "user" or "agent".
    type: Literal["user", "agent"]
    content: str
    timestamp: datetime
    metadata: Optional[AgentMetadata] = None
    isLoading: Optional[bool] = None
    truncated: Optional[bool] = None
88
+
89
+
90
class HealthResponse(BaseModel):
    """Health check response.

    All three fields are required.
    """

    status: str
    timestamp: datetime
    # Number of WebSocket connections reported at the time of the check.
    websocket_connections: int
cua2-core/src/cua2-core/routes/routes.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+
3
+ from fastapi import APIRouter, Depends, HTTPException, Request
4
+
5
+ # Get services from app state
6
+ from backend.models.models import HealthResponse
7
+ from backend.services.agent_service import AgentService
8
+ from backend.websocket.websocket_manager import WebSocketManager
9
+
10
+ # Create router
11
+ router = APIRouter()
12
+
13
+
14
def get_websocket_manager(request: Request) -> WebSocketManager:
    """FastAPI dependency: fetch the WebSocket manager stored on the app state."""
    app_state = request.app.state
    return app_state.websocket_manager
17
+
18
+
19
def get_agent_service(request: Request) -> AgentService:
    """FastAPI dependency: fetch the agent service stored on the app state."""
    app_state = request.app.state
    return app_state.agent_service
22
+
23
+
24
@router.get("/health", response_model=HealthResponse)
async def health_check(
    websocket_manager: WebSocketManager = Depends(get_websocket_manager),
):
    """Report service health with the current time and connection count.

    Args:
        websocket_manager: Injected manager that supplies the connection count.

    Returns:
        A HealthResponse with status "healthy".
    """
    checked_at = datetime.now()
    open_connections = websocket_manager.get_connection_count()
    return HealthResponse(
        status="healthy",
        timestamp=checked_at,
        websocket_connections=open_connections,
    )
34
+
35
+
36
@router.get("/tasks")
async def get_active_tasks(
    agent_service: AgentService = Depends(get_agent_service),
    websocket_manager: WebSocketManager = Depends(get_websocket_manager),
):
    """List the tasks the agent service is tracking, plus the connection count.

    Args:
        agent_service: Injected service that holds the active tasks.
        websocket_manager: Injected manager that supplies the connection count.

    Returns:
        A dict with keys "active_tasks" and "total_connections".
    """
    tasks = agent_service.get_active_tasks()
    connections = websocket_manager.get_connection_count()
    return {"active_tasks": tasks, "total_connections": connections}
46
+
47
+
48
@router.get("/tasks/{task_id}")
async def get_task_status(
    task_id: str, agent_service: AgentService = Depends(get_agent_service)
):
    """Look up one task by id.

    Args:
        task_id: Identifier taken from the URL path.
        agent_service: Injected service that holds the active tasks.

    Returns:
        A dict with the requested "task_id" and whatever the service returned
        for it under "status".

    Raises:
        HTTPException: 404 when the service has no entry for ``task_id``.
    """
    found = agent_service.get_task_status(task_id)
    if found is not None:
        return {"task_id": task_id, "status": found}
    raise HTTPException(status_code=404, detail="Task not found")
cua2-core/src/cua2-core/routes/websocket.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ from fastapi import APIRouter, WebSocket, WebSocketDisconnect
4
+
5
+ # Get services from app state
6
+ from backend.app import app
7
+ from backend.models.models import UserTaskMessage, WebSocketEvent
8
+
9
+ # Create router
10
+ router = APIRouter()
11
+
12
+
13
@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """WebSocket endpoint for real-time communication.

    Registers the socket with the WebSocket manager, sends a welcome
    heartbeat event, then loops: each text frame is parsed as JSON into a
    UserTaskMessage and handed to the agent service, and an "agent_start"
    acknowledgment carrying the new message id is sent back. Malformed JSON
    and any other processing failure are answered with an "agent_error"
    event and the loop continues. The socket is always unregistered from
    the manager on exit.

    Args:
        websocket: The incoming connection.
    """

    websocket_manager = app.state.websocket_manager
    agent_service = app.state.agent_service

    await websocket_manager.connect(websocket)

    try:
        welcome_message = WebSocketEvent(
            type="heartbeat",
            content="WebSocket connection established successfully",
            messageId="connection_welcome",
        )
        await websocket_manager.send_personal_message(welcome_message, websocket)

        # Keep the connection alive and wait for messages
        while True:
            try:
                # Wait for messages from client
                data = await websocket.receive_text()

                try:
                    # Parse the message
                    message_data = json.loads(data)
                    message = UserTaskMessage(**message_data)

                    # Process the user task
                    if message.type == "user_task":
                        message_id = await agent_service.process_user_task(
                            message.content, message.model_id
                        )

                        # Send acknowledgment back to the client
                        response = WebSocketEvent(
                            type="agent_start",
                            content=f"Received task: {message.content}",
                            messageId=message_id,
                        )
                        await websocket_manager.send_personal_message(
                            response, websocket
                        )

                except json.JSONDecodeError:
                    error_response = WebSocketEvent(
                        type="agent_error", content="Invalid JSON format"
                    )
                    await websocket_manager.send_personal_message(
                        error_response, websocket
                    )

                except Exception as e:
                    print(f"Error processing message: {e}")
                    error_response = WebSocketEvent(
                        type="agent_error",
                        content=f"Error processing message: {str(e)}",
                    )
                    await websocket_manager.send_personal_message(
                        error_response, websocket
                    )

            except WebSocketDisconnect:
                # A client disconnect is not a receive error. Without this
                # clause the generic handler below would swallow it and the
                # outer "disconnected normally" branch could never run.
                raise

            except Exception as e:
                print(f"Error receiving WebSocket message: {e}")
                # If we can't receive messages, the connection is likely broken
                break

    except WebSocketDisconnect:
        print("WebSocket disconnected normally")
    except Exception as e:
        print(f"WebSocket connection error: {e}")
    finally:
        # Ensure cleanup happens
        websocket_manager.disconnect(websocket)
cua2-core/src/cua2-core/services/agent_service.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import uuid
3
+ from datetime import datetime
4
+ from typing import Optional
5
+
6
+ from smolagents import Model
7
+
8
+ from backend.models.models import ActiveTask, AgentMetadata
9
+ from backend.services.agents.get_agents import get_agent
10
+ from backend.services.models.get_model import get_model
11
+ from backend.websocket.websocket_manager import WebSocketManager
12
+ from computer_use_studio import Sandbox
13
+ from computer_use_studio.logger import get_logger
14
+
15
+ logger = get_logger(__name__)
16
+
17
+
18
class AgentService:
    """Service for handling agent tasks and processing.

    Tracks accepted tasks in ``active_tasks`` (message id -> ActiveTask) and
    launches their processing as background asyncio tasks.
    """

    def __init__(self, websocket_manager):
        """Create an empty task registry bound to ``websocket_manager``.

        Args:
            websocket_manager: Manager stored on the instance for later use.
        """
        self.active_tasks: dict[str, ActiveTask] = {}
        self.websocket_manager: WebSocketManager = websocket_manager
        # The event loop only keeps weak references to tasks, so in-flight
        # background tasks are held here until they finish.
        self._background_tasks: set[asyncio.Task] = set()

    async def process_user_task(self, content: str, model_id: str) -> str:
        """Register a user task, start processing it, and return its id.

        Args:
            content: The task text supplied by the user.
            model_id: Identifier of the model to use.

        Returns:
            The newly generated message id (a UUID4 string), which is also
            the key of the task in ``active_tasks``.
        """

        # Regenerate on the (very unlikely) collision with a tracked task.
        message_id = str(uuid.uuid4())
        while message_id in self.active_tasks:
            message_id = str(uuid.uuid4())

        # Store the task
        self.active_tasks[message_id] = ActiveTask(
            message_id=message_id,
            content=content,
            model_id=model_id,
            start_time=datetime.now(),
            status="processing",
        )

        # Determine the agent type based on the content of the task (TODO: implement agent type detection using LLM)
        prompt_type = "FORM_SYSTEM_PROMPT"

        # NOTE(review): `_simulate_agent_processing` is not defined on this
        # class in the visible code - only the commented-out draft below
        # exists, and it takes (message_id, content) instead of the four
        # arguments passed here. Confirm where the real coroutine lives.
        background = asyncio.create_task(
            self._simulate_agent_processing(content, model_id, message_id, prompt_type)
        )
        # Keep a strong reference until completion so the task cannot be
        # garbage-collected mid-execution, then drop it automatically.
        self._background_tasks.add(background)
        background.add_done_callback(self._background_tasks.discard)

        return message_id

    # async def _simulate_agent_processing(self, message_id: str, content: str):
    #     """Simulate agent processing with progress updates"""
    #     try:
    #         # Send agent start event
    #         await self.websocket_manager.send_agent_start(
    #             content=f"Starting task: {content}", message_id=message_id
    #         )
    #
    #         # Simulate processing steps
    #         steps = [
    #             "Analyzing task requirements...",
    #             "Planning execution steps...",
    #             "Initializing computer interface...",
    #             "Executing task commands...",
    #             "Verifying results...",
    #             "Finalizing task completion...",
    #         ]
    #
    #         for i, step in enumerate(steps):
    #             await asyncio.sleep(2)  # Simulate processing time
    #
    #             # Send progress update
    #             await self.websocket_manager.send_agent_progress(
    #                 content=f"{step} ({i + 1}/{len(steps)})", message_id=message_id
    #             )
    #
    #             # Simulate VNC URL events during processing
    #             if i == 2:  # After "Initializing computer interface..."
    #                 # Set VNC URL when computer interface is ready
    #                 vnc_url = "http://localhost:6080/vnc.html?host=localhost&port=5900&autoconnect=true"
    #                 await self.websocket_manager.send_vnc_url_set(
    #                     vnc_url=vnc_url,
    #                     content="Computer interface ready, VNC stream connected",
    #                 )
    #             elif i == 4:  # After "Verifying results..."
    #                 # Unset VNC URL when task is almost complete
    #                 await self.websocket_manager.send_vnc_url_unset(
    #                     content="Task verification complete, disconnecting VNC stream"
    #                 )
    #
    #         # Calculate metadata
    #         end_time = datetime.now()
    #         start_time = self.active_tasks[message_id]["start_time"]
    #         time_taken = (end_time - start_time).total_seconds()
    #
    #         metadata = AgentMetadata(
    #             tokensUsed=150 + len(content) * 2,  # Simulate token usage
    #             timeTaken=time_taken,
    #             numberOfSteps=len(steps),
    #         )
    #
    #         # Send completion event
    #         await self.websocket_manager.send_agent_complete(
    #             content=f"Task completed successfully: {content}",
    #             message_id=message_id,
    #             metadata=metadata,
    #         )
    #
    #         # Clean up
    #         if message_id in self.active_tasks:
    #             del self.active_tasks[message_id]
    #
    #     except Exception as e:
    #         # Send error event
    #         await self.websocket_manager.send_agent_error(
    #             content=f"Error processing task: {str(e)}", message_id=message_id
    #         )
    #
    #         # Clean up
    #         if message_id in self.active_tasks:
    #             del self.active_tasks[message_id]

    def get_active_tasks(self) -> "dict[str, ActiveTask]":
        """Return a shallow copy of the active-task registry.

        The copy protects the registry's keys from caller mutation; the
        ActiveTask values themselves are shared, not copied.
        """
        return self.active_tasks.copy()

    def get_task_status(self, message_id: str) -> "Optional[ActiveTask]":
        """Return the tracked task for ``message_id``, or None if unknown."""
        return self.active_tasks.get(message_id)
cua2-core/src/cua2-core/services/agents/get_agents.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Annotated, TypeAlias
2
+
3
+ from pydantic import Field
4
+ from smolagents import Model
5
+
6
+ from backend.models.models import AgentType
7
+ from backend.services.agents.normalized_1000_agent import Normalized1000Agent
8
+ from backend.services.agents.normalized_agent import NormalizedAgent
9
+ from backend.services.agents.pixel_coordonates_agent import PixelCoordinatesAgent
10
+ from backend.services.agents.prompt import (
11
+ Normalized1000CoordinatesSystemPrompt,
12
+ NormalizedCoordinatesSystemPrompt,
13
+ PixelCoordinatesSystemPrompt,
14
+ )
15
+ from computer_use_studio import Sandbox
16
+
17
# Union of the three concrete agent classes, annotated with a pydantic Field
# that names "AGENT_TYPE" as the discriminator attribute.
Agent: TypeAlias = Annotated[
    PixelCoordinatesAgent | Normalized1000Agent | NormalizedAgent,
    Field(discriminator="AGENT_TYPE"),
]
21
+
22
+
23
def get_agent(
    model: Model,
    desktop: Sandbox,
    agent_type: AgentType,
    prompt_type: str,
    data_dir: str,
    **kwargs,
) -> Agent:
    """Build the agent that matches ``agent_type``.

    Each agent type is paired with its own system-prompt collection; the
    prompt is selected by looking up ``prompt_type`` by name in that
    collection and taking its ``.value``.

    Args:
        model: Model the agent will use.
        desktop: Sandbox the agent will control.
        agent_type: Which coordinate convention / agent class to build.
        prompt_type: Member name looked up in the matching prompt collection.
        data_dir: Directory forwarded to the agent constructor.
        **kwargs: Extra keyword arguments forwarded to the agent constructor.

    Returns:
        A PixelCoordinatesAgent, Normalized1000Agent or NormalizedAgent.

    Raises:
        ValueError: If ``agent_type`` is not one of the supported types.
    """
    if agent_type == AgentType.PIXEL_COORDINATES:
        return PixelCoordinatesAgent(
            model=model,
            desktop=desktop,
            system_prompt=PixelCoordinatesSystemPrompt[prompt_type].value,
            data_dir=data_dir,
            **kwargs,
        )
    elif agent_type == AgentType.NORMALIZED_1000_COORDINATES:
        return Normalized1000Agent(
            model=model,
            desktop=desktop,
            system_prompt=Normalized1000CoordinatesSystemPrompt[prompt_type].value,
            data_dir=data_dir,
            **kwargs,
        )
    elif agent_type == AgentType.NORMALIZED_COORDINATES:
        # This branch must build NormalizedAgent; it previously built
        # Normalized1000Agent, pairing the normalized prompt with an agent
        # that interprets coordinates on the 0-1000 scale.
        return NormalizedAgent(
            model=model,
            desktop=desktop,
            system_prompt=NormalizedCoordinatesSystemPrompt[prompt_type].value,
            data_dir=data_dir,
            **kwargs,
        )
    else:
        raise ValueError(f"Invalid agent type: {agent_type}")
cua2-core/src/cua2-core/services/agents/normalized_1000_agent.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import unicodedata
3
+ from typing import List, Literal
4
+
5
+ # smolagents imports
6
+ from smolagents import Model, Tool, tool
7
+ from smolagents.monitoring import LogLevel
8
+
9
+ from backend.models.models import AgentType
10
+ from backend.services.agents.prompt import Normalized1000CoordinatesSystemPrompt
11
+ from computer_use_studio import DesktopAgentBase, Sandbox
12
+
13
+
14
+ class Normalized1000Agent(DesktopAgentBase):
15
+ """Agent for desktop automation with normalized coordinates (0 to 1000)"""
16
+
17
+ AGENT_TYPE = AgentType.NORMALIZED_1000_COORDINATES
18
+
19
def __init__(
    self,
    model: Model,
    data_dir: str,
    desktop: Sandbox,
    system_prompt: Normalized1000CoordinatesSystemPrompt,
    tools: List[Tool] | None = None,
    max_steps: int = 20,
    verbosity_level: LogLevel = LogLevel.INFO,
    planning_interval: int | None = None,
    use_v1_prompt: bool = False,
    **kwargs,
):
    """Initialise the agent by delegating everything to DesktopAgentBase.

    Every named argument is forwarded unchanged, by keyword, to the base
    class constructor, followed by any extra keyword arguments.
    """
    base_options = dict(
        model=model,
        data_dir=data_dir,
        desktop=desktop,
        system_prompt=system_prompt,
        tools=tools,
        max_steps=max_steps,
        verbosity_level=verbosity_level,
        planning_interval=planning_interval,
        use_v1_prompt=use_v1_prompt,
    )
    super().__init__(**base_options, **kwargs)
44
+
45
+ def _normalize_to_pixel(self, norm_x: int, norm_y: int) -> tuple[int, int]:
46
+ """
47
+ Convert normalized coordinates (0-1000) to pixel coordinates
48
+ Args:
49
+ norm_x: Normalized x coordinate (0 to 1000)
50
+ norm_y: Normalized y coordinate (0 to 1000)
51
+ Returns:
52
+ Tuple of (pixel_x, pixel_y)
53
+ """
54
+ # Clamp values to valid range
55
+ norm_x = max(0, min(1000, norm_x))
56
+ norm_y = max(0, min(1000, norm_y))
57
+
58
+ # Convert from 0-1000 range to 0-1 range, then to pixels
59
+ norm_x_float = norm_x / 1000.0
60
+ norm_y_float = norm_y / 1000.0
61
+
62
+ pixel_x = int(norm_x_float * self.width)
63
+ pixel_y = int(norm_y_float * self.height)
64
+
65
+ # Ensure we don't go outside screen bounds
66
+ pixel_x = max(0, min(self.width - 1, pixel_x))
67
+ pixel_y = max(0, min(self.height - 1, pixel_y))
68
+
69
+ return pixel_x, pixel_y
70
+
71
+ def _setup_desktop_tools(self):
72
+ """Register all desktop tools with normalized coordinate support (0-1000)"""
73
+
74
+ @tool
75
+ def click(x: int, y: int) -> str:
76
+ """
77
+ Performs a left-click at the specified normalized coordinates
78
+ Args:
79
+ x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
80
+ y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
81
+ """
82
+ pixel_x, pixel_y = self._normalize_to_pixel(x, y)
83
+ self.desktop.left_click(pixel_x, pixel_y)
84
+ self.click_coordinates = (pixel_x, pixel_y)
85
+ self.logger.log(
86
+ f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
87
+ )
88
+ time.sleep(1)
89
+ return f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
90
+
91
+ @tool
92
+ def right_click(x: int, y: int) -> str:
93
+ """
94
+ Performs a right-click at the specified normalized coordinates
95
+ Args:
96
+ x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
97
+ y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
98
+ """
99
+ pixel_x, pixel_y = self._normalize_to_pixel(x, y)
100
+ self.desktop.right_click(pixel_x, pixel_y)
101
+ self.click_coordinates = (pixel_x, pixel_y)
102
+ self.logger.log(
103
+ f"Right-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
104
+ )
105
+ return f"Right-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
106
+
107
+ @tool
108
+ def double_click(x: int, y: int) -> str:
109
+ """
110
+ Performs a double-click at the specified normalized coordinates
111
+ Args:
112
+ x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
113
+ y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
114
+ """
115
+ pixel_x, pixel_y = self._normalize_to_pixel(x, y)
116
+ self.desktop.double_click(pixel_x, pixel_y)
117
+ self.click_coordinates = (pixel_x, pixel_y)
118
+ self.logger.log(
119
+ f"Double-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
120
+ )
121
+ return f"Double-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
122
+
123
+ @tool
124
+ def move_mouse(x: int, y: int) -> str:
125
+ """
126
+ Moves the mouse cursor to the specified normalized coordinates
127
+ Args:
128
+ x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
129
+ y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
130
+ """
131
+ pixel_x, pixel_y = self._normalize_to_pixel(x, y)
132
+ self.desktop.move_mouse(pixel_x, pixel_y)
133
+ self.logger.log(
134
+ f"Moved mouse to normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
135
+ )
136
+ return f"Moved mouse to normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
137
+
138
+ def normalize_text(text):
139
+ return "".join(
140
+ c
141
+ for c in unicodedata.normalize("NFD", text)
142
+ if not unicodedata.combining(c)
143
+ )
144
+
145
+ @tool
146
+ def write(text: str) -> str:
147
+ """
148
+ Types the specified text at the current cursor position.
149
+ Args:
150
+ text: The text to type
151
+ """
152
+ # clean_text = normalize_text(text)
153
+ self.desktop.write(text, delay_in_ms=10)
154
+ self.logger.log(f"Typed text: '{text}'")
155
+ time.sleep(1)
156
+ return f"Typed text: '{text}'"
157
+
158
+ @tool
159
+ def press(key: str) -> str:
160
+ """
161
+ Presses a keyboard key or combination of keys
162
+ Args:
163
+ key: The key to press (e.g. "enter", "space", "backspace", etc.) or a multiple keys string to press, for example "ctrl+a" or "ctrl+shift+a".
164
+ """
165
+ self.desktop.press(key)
166
+ self.logger.log(f"Pressed key: {key}")
167
+ time.sleep(0.1)
168
+ return f"Pressed key: {key}"
169
+
170
+ @tool
171
+ def drag(x1: int, y1: int, x2: int, y2: int) -> str:
172
+ """
173
+ Clicks at normalized coordinates [x1, y1], drags mouse to [x2, y2], then release click.
174
+ Args:
175
+ x1: origin normalized x coordinate (0 to 1000)
176
+ y1: origin normalized y coordinate (0 to 1000)
177
+ x2: end normalized x coordinate (0 to 1000)
178
+ y2: end normalized y coordinate (0 to 1000)
179
+ """
180
+ pixel_x1, pixel_y1 = self._normalize_to_pixel(x1, y1)
181
+ pixel_x2, pixel_y2 = self._normalize_to_pixel(x2, y2)
182
+ self.desktop.drag((pixel_x1, pixel_y1), (pixel_x2, pixel_y2))
183
+ message = f"Dragged and dropped from normalized [{x1}, {y1}] to [{x2}, {y2}] -> pixels [{pixel_x1}, {pixel_y1}] to [{pixel_x2}, {pixel_y2}]"
184
+ self.logger.log(message)
185
+ return message
186
+
187
+ @tool
188
+ def scroll(
189
+ x: int,
190
+ y: int,
191
+ direction: Literal["up", "down"] = "down",
192
+ amount: int = 2,
193
+ ) -> str:
194
+ """
195
+ Moves the mouse to selected normalized coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
196
+ Args:
197
+ x: The normalized x coordinate (0 to 1000) of the element to scroll/zoom
198
+ y: The normalized y coordinate (0 to 1000) of the element to scroll/zoom
199
+ direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
200
+ amount: The amount to scroll. A good amount is 1 or 2.
201
+ """
202
+ pixel_x, pixel_y = self._normalize_to_pixel(x, y)
203
+ self.desktop.move_mouse(pixel_x, pixel_y)
204
+ self.desktop.scroll(direction=direction, amount=amount)
205
+ message = f"Scrolled {direction} by {amount} at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
206
+ self.logger.log(message)
207
+ return message
208
+
209
+ @tool
210
+ def wait(seconds: float) -> str:
211
+ """
212
+ Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
213
+ Args:
214
+ seconds: Number of seconds to wait, generally 3 is enough.
215
+ """
216
+ time.sleep(seconds)
217
+ self.logger.log(f"Waited for {seconds} seconds")
218
+ return f"Waited for {seconds} seconds"
219
+
220
+ @tool
221
+ def open(file_or_url: str) -> str:
222
+ """
223
+ Directly opens a browser with the specified url or opens a file with the default application: use this at start of web searches rather than trying to click the browser or open a file by clicking.
224
+ Args:
225
+ file_or_url: The URL or file to open
226
+ """
227
+
228
+ self.desktop.open(file_or_url)
229
+ # Give it time to load
230
+ time.sleep(2)
231
+ self.logger.log(f"Opening: {file_or_url}")
232
+ return f"Opened: {file_or_url}"
233
+
234
+ @tool
235
+ def launch_app(app_name: str) -> str:
236
+ """
237
+ Launches the specified application.
238
+ Args:
239
+ app_name: the name of the application to launch
240
+ """
241
+ self.desktop.launch(app_name)
242
+ self.logger.log(f"Launched app: {app_name}")
243
+ return f"Launched app: {app_name}"
244
+
245
+ @tool
246
+ def execute(command: str) -> str:
247
+ """
248
+ Executes a terminal command in the desktop environment.
249
+ Args:
250
+ command: The command to execute
251
+ """
252
+ self.desktop.execute_command(command)
253
+ self.logger.log(f"Executed command: {command}")
254
+ return f"Executed command: {command}"
255
+
256
+ @tool
257
+ def refresh() -> str:
258
+ """
259
+ Refreshes the current web page if you're in a browser.
260
+ """
261
+ self.desktop.press(["ctrl", "r"])
262
+ self.logger.log("Refreshed the current page")
263
+ return "Refreshed the current page"
264
+
265
+ @tool
266
+ def go_back() -> str:
267
+ """
268
+ Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
269
+ Args:
270
+ """
271
+ self.desktop.press(["alt", "left"])
272
+ self.logger.log("Went back one page")
273
+ return "Went back one page"
274
+
275
+ # Register the tools
276
+ self.tools["click"] = click
277
+ self.tools["right_click"] = right_click
278
+ self.tools["double_click"] = double_click
279
+ self.tools["move_mouse"] = move_mouse
280
+ self.tools["write"] = write
281
+ self.tools["press"] = press
282
+ self.tools["scroll"] = scroll
283
+ self.tools["wait"] = wait
284
+ self.tools["open"] = open
285
+ self.tools["go_back"] = go_back
286
+ self.tools["drag"] = drag
287
+ self.tools["launch_app"] = launch_app
288
+ self.tools["execute"] = execute
289
+ self.tools["refresh"] = refresh
290
+ self.tools["refresh"] = refresh
291
+ self.tools["execute"] = execute
292
+ self.tools["refresh"] = refresh
293
+ self.tools["refresh"] = refresh
cua2-core/src/cua2-core/services/agents/normalized_agent.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import unicodedata
3
+ from typing import List, Literal
4
+
5
+ # smolagents imports
6
+ from smolagents import Model, Tool, tool
7
+ from smolagents.monitoring import LogLevel
8
+
9
+ from backend.models.models import AgentType
10
+ from backend.services.agents.prompt import NormalizedCoordinatesSystemPrompt
11
+ from computer_use_studio import DesktopAgentBase, Sandbox
12
+
13
+
14
class NormalizedAgent(DesktopAgentBase):
    """Desktop automation agent whose tools take coordinates in 0.0-1.0.

    Coordinate-taking tools translate their arguments to screen pixels with
    ``_normalize_to_pixel`` before calling the sandbox. Every tool result
    reports both the normalized input and the resulting pixel position.
    """

    AGENT_TYPE = AgentType.NORMALIZED_COORDINATES

    def __init__(
        self,
        model: Model,
        data_dir: str,
        desktop: Sandbox,
        system_prompt: NormalizedCoordinatesSystemPrompt,
        tools: List[Tool] | None = None,
        max_steps: int = 20,
        verbosity_level: LogLevel = LogLevel.INFO,
        planning_interval: int | None = None,
        use_v1_prompt: bool = False,
        **kwargs,
    ):
        """Forward every argument unchanged to ``DesktopAgentBase.__init__``."""
        super().__init__(
            model=model,
            data_dir=data_dir,
            desktop=desktop,
            system_prompt=system_prompt,
            tools=tools,
            max_steps=max_steps,
            verbosity_level=verbosity_level,
            planning_interval=planning_interval,
            use_v1_prompt=use_v1_prompt,
            **kwargs,
        )

    def _normalize_to_pixel(self, norm_x: float, norm_y: float) -> tuple[int, int]:
        """
        Convert normalized coordinates (0.0-1.0) to pixel coordinates

        Out-of-range inputs are clamped to [0.0, 1.0]; the result is truncated
        to an integer and clamped to [0, width - 1] x [0, height - 1].

        Args:
            norm_x: Normalized x coordinate (0.0 to 1.0)
            norm_y: Normalized y coordinate (0.0 to 1.0)
        Returns:
            Tuple of (pixel_x, pixel_y)
        """
        # NOTE(review): self.width / self.height are not set in this class;
        # presumably DesktopAgentBase provides them - confirm.
        # Clamp values to valid range
        norm_x = max(0.0, min(1.0, norm_x))
        norm_y = max(0.0, min(1.0, norm_y))

        pixel_x = int(norm_x * self.width)
        pixel_y = int(norm_y * self.height)

        # 1.0 maps to width/height exactly, which is one past the last pixel
        pixel_x = max(0, min(self.width - 1, pixel_x))
        pixel_y = max(0, min(self.height - 1, pixel_y))

        return pixel_x, pixel_y

    def _setup_desktop_tools(self):
        """Register all desktop tools with normalized coordinate support"""
        # The docstrings of the @tool functions below are read by smolagents to
        # build the tool descriptions shown to the model, so their wording is
        # part of the agent's behaviour - edit them deliberately.

        @tool
        def click(x: float, y: float) -> str:
            """
            Performs a left-click at the specified normalized coordinates
            Args:
                x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
                y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
            """
            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
            self.desktop.left_click(pixel_x, pixel_y)
            self.click_coordinates = (pixel_x, pixel_y)
            message = f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
            self.logger.log(message)
            return message

        @tool
        def right_click(x: float, y: float) -> str:
            """
            Performs a right-click at the specified normalized coordinates
            Args:
                x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
                y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
            """
            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
            self.desktop.right_click(pixel_x, pixel_y)
            self.click_coordinates = (pixel_x, pixel_y)
            # Report the normalized input and the pixel result separately;
            # the pixel values must not be labelled as normalized.
            message = f"Right-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
            self.logger.log(message)
            return message

        @tool
        def double_click(x: float, y: float) -> str:
            """
            Performs a double-click at the specified normalized coordinates
            Args:
                x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
                y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
            """
            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
            self.desktop.double_click(pixel_x, pixel_y)
            self.click_coordinates = (pixel_x, pixel_y)
            message = f"Double-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
            self.logger.log(message)
            return message

        @tool
        def move_mouse(x: float, y: float) -> str:
            """
            Moves the mouse cursor to the specified normalized coordinates
            Args:
                x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
                y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
            """
            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
            self.desktop.move_mouse(pixel_x, pixel_y)
            message = f"Moved mouse to normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
            self.logger.log(message)
            return message

        # NOTE: currently unused - the call in write() below is disabled.
        def normalize_text(text):
            # Decompose to NFD and drop combining marks (strips accents).
            return "".join(
                c
                for c in unicodedata.normalize("NFD", text)
                if not unicodedata.combining(c)
            )

        @tool
        def write(text: str) -> str:
            """
            Types the specified text at the current cursor position.
            Args:
                text: The text to type
            """
            # clean_text = normalize_text(text)
            self.desktop.write(text, delay_in_ms=10)
            message = f"Typed text: '{text}'"
            self.logger.log(message)
            return message

        @tool
        def press(key: str) -> str:
            """
            Presses a keyboard key or combination of keys
            Args:
                key: The key to press (e.g. "enter", "space", "backspace", etc.) or a multiple keys string to press, for example "ctrl+a" or "ctrl+shift+a".
            """
            self.desktop.press(key)
            message = f"Pressed key: {key}"
            self.logger.log(message)
            return message

        @tool
        def drag(x1: float, y1: float, x2: float, y2: float) -> str:
            """
            Clicks at normalized coordinates [x1, y1], drags mouse to [x2, y2], then release click.
            Args:
                x1: origin normalized x coordinate (0.0 to 1.0)
                y1: origin normalized y coordinate (0.0 to 1.0)
                x2: end normalized x coordinate (0.0 to 1.0)
                y2: end normalized y coordinate (0.0 to 1.0)
            """
            pixel_x1, pixel_y1 = self._normalize_to_pixel(x1, y1)
            pixel_x2, pixel_y2 = self._normalize_to_pixel(x2, y2)
            self.desktop.drag((pixel_x1, pixel_y1), (pixel_x2, pixel_y2))
            message = f"Dragged and dropped from normalized [{x1}, {y1}] to [{x2}, {y2}] -> pixels [{pixel_x1}, {pixel_y1}] to [{pixel_x2}, {pixel_y2}]"
            self.logger.log(message)
            return message

        @tool
        def scroll(
            x: float,
            y: float,
            direction: Literal["up", "down"] = "down",
            amount: int = 2,
        ) -> str:
            """
            Moves the mouse to selected normalized coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
            Args:
                x: The normalized x coordinate (0.0 to 1.0) of the element to scroll/zoom
                y: The normalized y coordinate (0.0 to 1.0) of the element to scroll/zoom
                direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
                amount: The amount to scroll. A good amount is 1 or 2.
            """
            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
            self.desktop.move_mouse(pixel_x, pixel_y)
            self.desktop.scroll(direction=direction, amount=amount)
            message = f"Scrolled {direction} by {amount} at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
            self.logger.log(message)
            return message

        @tool
        def wait(seconds: float) -> str:
            """
            Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
            Args:
                seconds: Number of seconds to wait, generally 3 is enough.
            """
            time.sleep(seconds)
            message = f"Waited for {seconds} seconds"
            self.logger.log(message)
            return message

        @tool
        def open(file_or_url: str) -> str:
            """
            Directly opens a browser with the specified url or opens a file with the default application: use this at start of web searches rather than trying to click the browser or open a file by clicking.
            Args:
                file_or_url: The URL or file to open
            """

            self.desktop.open(file_or_url)
            # Give it time to load
            time.sleep(2)
            self.logger.log(f"Opening: {file_or_url}")
            return f"Opened: {file_or_url}"

        @tool
        def launch_app(app_name: str) -> str:
            """
            Launches the specified application.
            Args:
                app_name: the name of the application to launch
            """
            self.desktop.launch(app_name)
            message = f"Launched app: {app_name}"
            self.logger.log(message)
            return message

        @tool
        def execute(command: str) -> str:
            """
            Executes a terminal command in the desktop environment.
            Args:
                command: The command to execute
            """
            self.desktop.execute_command(command)
            message = f"Executed command: {command}"
            self.logger.log(message)
            return message

        @tool
        def refresh() -> str:
            """
            Refreshes the current web page if you're in a browser.
            """
            self.desktop.press(["ctrl", "r"])
            message = "Refreshed the current page"
            self.logger.log(message)
            return message

        @tool
        def go_back() -> str:
            """
            Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
            Args:
            """
            self.desktop.press(["alt", "left"])
            message = "Went back one page"
            self.logger.log(message)
            return message

        # Register the tools
        for tool_name, tool_obj in (
            ("click", click),
            ("right_click", right_click),
            ("double_click", double_click),
            ("move_mouse", move_mouse),
            ("write", write),
            ("press", press),
            ("scroll", scroll),
            ("wait", wait),
            ("open", open),
            ("go_back", go_back),
            ("drag", drag),
            ("launch_app", launch_app),
            ("execute", execute),
            ("refresh", refresh),
        ):
            self.tools[tool_name] = tool_obj
cua2-core/src/cua2-core/services/agents/pixel_coordonates_agent.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import unicodedata
3
+ from typing import List, Literal
4
+
5
+ # smolagents imports
6
+ from smolagents import Model, Tool, tool
7
+ from smolagents.monitoring import LogLevel
8
+
9
+ from backend.models.models import AgentType
10
+ from backend.services.agents.prompt import PixelCoordinatesSystemPrompt
11
+ from computer_use_studio import DesktopAgentBase, Sandbox
12
+
13
+
14
class PixelCoordinatesAgent(DesktopAgentBase):
    """Desktop automation agent whose tools take raw pixel coordinates."""

    AGENT_TYPE = AgentType.PIXEL_COORDINATES

    def __init__(
        self,
        model: Model,
        data_dir: str,
        desktop: Sandbox,
        system_prompt: PixelCoordinatesSystemPrompt,
        tools: List[Tool] | None = None,
        max_steps: int = 20,
        verbosity_level: LogLevel = LogLevel.INFO,
        planning_interval: int | None = None,
        use_v1_prompt: bool = False,
        **kwargs,
    ):
        """Hand every argument straight through to ``DesktopAgentBase``."""
        super().__init__(
            model=model,
            data_dir=data_dir,
            desktop=desktop,
            system_prompt=system_prompt,
            tools=tools,
            max_steps=max_steps,
            verbosity_level=verbosity_level,
            planning_interval=planning_interval,
            use_v1_prompt=use_v1_prompt,
            **kwargs,
        )

        # OPTIONAL: install a custom prompt template here. The default one is
        # described in src/computer_use_studio/desktop_agent/desktop_agent_base.py.
        # self.prompt_templates["system_prompt"] = CUSTOM_PROMPT_TEMPLATE.replace(
        #     "<<resolution_x>>", str(self.width)
        # ).replace("<<resolution_y>>", str(self.height))
        # Important: adapt the prompt to your action space for better results.

    def _setup_desktop_tools(self):
        """Register all desktop tools"""
        # The @tool docstrings below are read by smolagents to describe each
        # tool to the model; they are kept verbatim.
        sandbox = self.desktop

        def report(outcome: str) -> str:
            # Log the outcome and hand the same text back to the agent.
            self.logger.log(outcome)
            return outcome

        @tool
        def click(x: int, y: int) -> str:
            """
            Performs a left-click at the specified coordinates
            Args:
                x: The x coordinate (horizontal position)
                y: The y coordinate (vertical position)
            """
            sandbox.left_click(x, y)
            self.click_coordinates = (x, y)
            return report(f"Clicked at coordinates ({x}, {y})")

        @tool
        def right_click(x: int, y: int) -> str:
            """
            Performs a right-click at the specified coordinates
            Args:
                x: The x coordinate (horizontal position)
                y: The y coordinate (vertical position)
            """
            sandbox.right_click(x, y)
            self.click_coordinates = (x, y)
            return report(f"Right-clicked at coordinates ({x}, {y})")

        @tool
        def double_click(x: int, y: int) -> str:
            """
            Performs a double-click at the specified coordinates
            Args:
                x: The x coordinate (horizontal position)
                y: The y coordinate (vertical position)
            """
            sandbox.double_click(x, y)
            self.click_coordinates = (x, y)
            return report(f"Double-clicked at coordinates ({x}, {y})")

        @tool
        def move_mouse(x: int, y: int) -> str:
            """
            Moves the mouse cursor to the specified coordinates
            Args:
                x: The x coordinate (horizontal position)
                y: The y coordinate (vertical position)
            """
            sandbox.move_mouse(x, y)
            return report(f"Moved mouse to coordinates ({x}, {y})")

        # Accent-stripping helper; not called while the line in write() stays
        # commented out.
        def normalize_text(text):
            decomposed = unicodedata.normalize("NFD", text)
            return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

        @tool
        def write(text: str) -> str:
            """
            Types the specified text at the current cursor position.
            Args:
                text: The text to type
            """
            # clean_text = normalize_text(text)
            sandbox.write(text, delay_in_ms=10)
            return report(f"Typed text: '{text}'")

        @tool
        def press(key: str) -> str:
            """
            Presses a keyboard key or combination of keys
            Args:
                key: The key to press (e.g. "enter", "space", "backspace", etc.) or a multiple keys string to press, for example "ctrl+a" or "ctrl+shift+a".
            """
            sandbox.press(key)
            return report(f"Pressed key: {key}")

        @tool
        def drag(x1: int, y1: int, x2: int, y2: int) -> str:
            """
            Clicks [x1, y1], drags mouse to [x2, y2], then release click.
            Args:
                x1: origin x coordinate
                y1: origin y coordinate
                x2: end x coordinate
                y2: end y coordinate
            """
            start, end = (x1, y1), (x2, y2)
            sandbox.drag(start, end)
            return report(f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]")

        @tool
        def scroll(
            x: int, y: int, direction: Literal["up", "down"] = "down", amount: int = 2
        ) -> str:
            """
            Moves the mouse to selected coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
            Args:
                x: The x coordinate (horizontal position) of the element to scroll/zoom
                y: The y coordinate (vertical position) of the element to scroll/zoom
                direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
                amount: The amount to scroll. A good amount is 1 or 2.
            """
            sandbox.move_mouse(x, y)
            sandbox.scroll(direction=direction, amount=amount)
            return report(f"Scrolled {direction} by {amount}")

        @tool
        def wait(seconds: float) -> str:
            """
            Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
            Args:
                seconds: Number of seconds to wait, generally 3 is enough.
            """
            time.sleep(seconds)
            return report(f"Waited for {seconds} seconds")

        @tool
        def open(file_or_url: str) -> str:
            """
            Directly opens a browser with the specified url or opens a file with the default application: use this at start of web searches rather than trying to click the browser or open a file by clicking.
            Args:
                file_or_url: The URL or file to open
            """
            sandbox.open(file_or_url)
            # Let the target load before the next step
            time.sleep(2)
            # The logged and returned texts differ here, so report() is not used.
            self.logger.log(f"Opening: {file_or_url}")
            return f"Opened: {file_or_url}"

        @tool
        def launch_app(app_name: str) -> str:
            """
            Launches the specified application.
            Args:
                app_name: the name of the application to launch
            """
            sandbox.launch(app_name)
            return report(f"Launched app: {app_name}")

        @tool
        def execute(command: str) -> str:
            """
            Executes a terminal command in the desktop environment.
            Args:
                command: The command to execute
            """
            sandbox.execute_command(command)
            return report(f"Executed command: {command}")

        @tool
        def refresh() -> str:
            """
            Refreshes the current web page if you're in a browser.
            """
            sandbox.press(["ctrl", "r"])
            return report("Refreshed the current page")

        @tool
        def go_back() -> str:
            """
            Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
            Args:
            """
            sandbox.press(["alt", "left"])
            return report("Went back one page")

        # Register the tools, keeping the original insertion order
        registry = (
            ("click", click),
            ("right_click", right_click),
            ("double_click", double_click),
            ("move_mouse", move_mouse),
            ("write", write),
            ("press", press),
            ("scroll", scroll),
            ("wait", wait),
            ("open", open),
            ("go_back", go_back),
            ("drag", drag),
            ("launch_app", launch_app),
            ("execute", execute),
            ("refresh", refresh),
        )
        for tool_name, tool_obj in registry:
            self.tools[tool_name] = tool_obj
249
+
250
+
251
if __name__ == "__main__":
    # Manual entry point: repeatedly read a task from stdin and run it in a
    # fresh sandbox with a PixelCoordinatesAgent.

    # ================================
    # MODEL CONFIGURATION
    # ================================

    # import os

    # from smolagents import OpenAIServerModel

    # model = OpenAIServerModel(
    #     model_id="gpt-4.1",
    #     api_key=os.getenv("OPENAI_API_KEY"),
    # )

    # For Inference Endpoints
    # from smolagents import HfApiModel
    # model = HfApiModel(
    #     model_id="Qwen/Qwen2.5-VL-72B-Instruct",
    #     token=os.getenv("HF_TOKEN"),
    #     provider="nebius",
    # )

    # For Transformer models
    # from smolagents import TransformersModel
    # model = TransformersModel(
    #     model_id="Qwen/Qwen2.5-VL-72B-Instruct",
    #     device_map="auto",
    #     torch_dtype="auto",
    #     trust_remote_code=True,
    # )

    # For other providers
    from smolagents import LiteLLMModel

    model = LiteLLMModel(model_id="anthropic/claude-sonnet-4-5-20250929")
    # model = LiteLLMModel(model_id="gemini/gemini-2.5-flash")

    # ================================
    # RUN AGENT
    # ================================

    # Interactive task input loop
    while True:
        # Reset per iteration so the cleanup below never touches the sandbox
        # or agent of an earlier task.
        sandbox = None
        agent = None

        # Read the task outside the broad try/except: a failure here must end
        # the loop instead of being reported and retried forever.
        try:
            task = input("\nEnter a task (empty line to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            break
        if not task:
            break

        try:
            sandbox = Sandbox(headless=False, resolution=(1024, 1024))
            sandbox.start_recording()
            # NOTE(review): FORM_SYSTEM_PROMPT is the prompt member visible in
            # prompt.py; confirm it is the intended one for this script.
            agent = PixelCoordinatesAgent(
                model=model,
                data_dir="data",
                desktop=sandbox,
                system_prompt=PixelCoordinatesSystemPrompt.FORM_SYSTEM_PROMPT,
            )

            print("\n🤖 Agent is working on your task...")
            print("-" * 60)
            result = agent.run(task)
            print("\n✅ Task completed successfully!")
            print(f"📄 Result: {result}")
        except Exception as e:
            # Top-level boundary of an interactive tool: report and continue
            # with the next task.
            print(f"\n❌ Error occurred: {str(e)}")
        finally:
            if sandbox:
                sandbox.end_recording("recording.mp4")
            if agent:
                agent.close()

        print("\n" + "=" * 60)
cua2-core/src/cua2-core/services/agents/prompt.py ADDED
@@ -0,0 +1,548 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+
3
+
4
+ class PixelCoordinatesSystemPrompt(Enum):
5
+ """Pixel coordinates system prompt"""
6
+
7
+ FORM_SYSTEM_PROMPT = """You are a web form automation assistant that can control a remote desktop environment with a web browser open.
8
+ The current date is <<current_date>>.
9
+
10
+ <action_process>
11
+ You will be given a task to complete in several steps (e.g. filling forms, signing up, logging in, submitting claims).
12
+ At each step you will perform **one action**.
13
+ After each action, you will receive an updated screenshot.
14
+ Then you will proceed as follows, with these sections — do not skip any:
15
+
16
+ Short term goal: ...
17
+ What I see: ...
18
+ Reflection: ...
19
+ Action:
20
+ ```python
21
+ tool_name(arguments)
22
+ ```<end_code>
23
+
24
+ Always format your Action section as **Python code blocks** exactly as shown above.
25
+ </action_process>
26
+
27
+ <tools>
28
+ On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
29
+ {%- for tool in tools.values() %}
30
+ - {{ tool.name }}: {{ tool.description }}
31
+ Takes inputs: {{tool.inputs}}
32
+ Returns an output of type: {{tool.output_type}}
33
+ {%- endfor %}
34
+ </tools>
35
+
36
+ <web_form_guidelines>
37
+ Look at the elements on the screen (input fields, checkboxes, buttons, dropdowns) to decide where to interact.
38
+ The desktop has a resolution of <<resolution_x>>x<<resolution_y>> pixels — use that to decide mouse coordinates.
39
+ **Never use hypothetical or assumed coordinates; always use real coordinates visible on the screenshot.**
40
+
41
+ ### Typical Web Form Interactions
42
+ - **Input fields**: click in the field first to focus it, then use `write("text")`.
43
+ - **Passwords**: type them just like text — `write("password123")`.
44
+ - **Checkboxes / radio buttons**: use `click(x,y)` directly on the box/circle.
45
+ - **Dropdown menus**: click to open, then click the option or use arrow keys + press("enter").
46
+ - **Submit buttons**: identify clearly labelled “Sign up”, “Sign in”, “Submit” buttons and click at their coordinates.
47
+ - **Captcha or 2FA**: wait or prompt for external handling (do not bypass security).
48
+ - **Pop-ups**: try `press("enter")` to confirm or `press("escape")` to close if they block your action.
49
+
50
+ ### Grouping Multiple Inputs
51
+ - If multiple fields, checkboxes, or similar controls are clearly visible **and** can be filled/clicked in sequence without ambiguity, you may include **several actions in one code block**.
52
+ - Keep each Action still in a **Python code block** but with multiple tool calls inside, for example:
53
+ ```python
54
+ click(450, 320) # Email field
55
+ wait(0.1)
56
+ write("user@example.com")
57
+ click(450, 380) # Password field
58
+ wait(0.1)
59
+ write("mypassword123")
60
+ click(430, 600) # Checkbox “Accept terms”
61
+ wait(0.1)
62
+ ```<end_code>
63
+ - Only group actions when:
64
+ 1. They’re all part of the **same form or step**,
65
+ 2. The screenshot clearly shows all elements and coordinates,
66
+ 3. The order of operations is obvious.
67
+ - Otherwise, default back to one Action per step.
68
+
69
+ ### Precision
70
+ - Always **click before typing** to ensure the right field is active.
71
+ - Always **scroll if needed** to bring elements into view before clicking.
72
+ - Always **validate each action** via the screenshot before continuing.
73
+
74
+ </web_form_guidelines>
75
+
76
+ <task_resolution_example>
77
+ For a task like “Sign up for an account and submit the form”:
78
+
79
+ Step 1:
80
+ Short term goal: I want to open the signup page.
81
+ What I see: The browser is open on the homepage.
82
+ Reflection: I will open the signup URL directly.
83
+ Action:
84
+ ```python
85
+ open("https://example.com/signup")
86
+ wait(3)
87
+ ```<end_code>
88
+
89
+ Step 2:
90
+ Short term goal: I want to fill the “Email” field.
91
+ What I see: I see the signup form with an “Email” field at (450, 320).
92
+ Reflection: I will click inside the field then type my email.
93
+ Action:
94
+ ```python
95
+ click(450, 320)
96
+ write("user@example.com")
97
+ ```<end_code>
98
+
99
+ Step 3:
100
+ Short term goal: I want to check the “I accept terms” checkbox.
101
+ What I see: The checkbox is at (430, 600).
102
+ Reflection: I will click it.
103
+ Action:
104
+ ```python
105
+ click(430, 600)
106
+ ```<end_code>
107
+
108
+ Step 4:
109
+ Short term goal: I want to submit the form.
110
+ What I see: The “Sign Up” button at (500, 700).
111
+ Reflection: I will click the button to submit.
112
+ Action:
113
+ ```python
114
+ click(500, 700)
115
+ wait(3)
116
+ ```<end_code>
117
+
118
+ Step 5:
119
+ Short term goal: Verify signup completed.
120
+ What I see: A confirmation page “Welcome user@example.com”.
121
+ Reflection: Task complete.
122
+ Action:
123
+ ```python
124
+ final_answer("Signup completed")
125
+ ```<end_code>
126
+ </task_resolution_example>
127
+
128
+ <general_guidelines>
129
+ # GUI Agent Guidelines for Web Forms
130
+
131
+ ## Environment Overview
132
+ Ubuntu 22.04 XFCE4 desktop with Google Chrome/Chromium browser.
133
+ Agent can fill forms, sign up, sign in, click checkboxes, submit claims.
134
+
135
+ ## Core Principles
136
+
137
+ ### 1. Screenshot Analysis
138
+ - Always analyze the latest screenshot carefully before each action.
139
+ - Validate that previous actions worked by examining the current state.
140
+ - If an action didn’t work, try an alternative rather than repeating blindly.
141
+
142
+ ### 2. Action Execution
143
+ - Execute one action or multiple actions at a time (grouped in one code block).
144
+ - Wait for appropriate loading times using `wait()` but not indefinitely.
145
+ - Scroll to bring hidden elements into view.
146
+
147
+ ### 3. Keyboard Shortcuts
148
+ - Use `tab` to move between fields, `space` to toggle checkboxes, `enter` to submit forms.
149
+ - Copy/paste: `ctrl+C`, `ctrl+V`.
150
+ - Refresh page: `refresh()`.
151
+
152
+ ### 4. Error Recovery
153
+ - If clicking doesn’t work, try double_click or right_click.
154
+ - If typing doesn’t appear, ensure the field is focused with click.
155
+ - If popups block the screen, try `press("enter")` or `press("escape")`.
156
+
157
+ ### 5. Security & Privacy
158
+ - Don’t attempt to bypass captchas or 2FA automatically.
159
+ - Don’t store credentials in plain text unless instructed.
160
+
161
+ ### 6. Final Answer
162
+ - When the form is successfully submitted or the goal achieved, use:
163
+ ```python
164
+ final_answer("Done")
165
+ ```<end_code>
166
+ </general_guidelines>
167
+ """
168
+
169
+
170
+ class Normalized1000CoordinatesSystemPrompt(Enum):
171
+ """Normalized 1000 coordinates system prompt"""
172
+
173
+ FORM_SYSTEM_PROMPT = """You are a web form automation assistant that can control a remote desktop environment with a web browser open.
174
+ The current date is <<current_date>>.
175
+
176
+ <action_process>
177
+ You will be given a task to complete in several steps (e.g. filling forms, signing up, logging in, submitting claims).
178
+ At each step you will perform **one action**.
179
+ After each action, you will receive an updated screenshot.
180
+ Then you will proceed as follows, with these sections — do not skip any:
181
+
182
+ Short term goal: ...
183
+ What I see: ...
184
+ Reflection: ...
185
+ Action:
186
+ ```python
187
+ tool_name(arguments)
188
+ ```<end_code>
189
+
190
+ Always format your Action section as **Python code blocks** exactly as shown above.
191
+ </action_process>
192
+
193
+ <tools>
194
+ On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
195
+ {%- for tool in tools.values() %}
196
+ - {{ tool.name }}: {{ tool.description }}
197
+ Takes inputs: {{tool.inputs}}
198
+ Returns an output of type: {{tool.output_type}}
199
+ {%- endfor %}
200
+ </tools>
201
+
202
+ <coordinate_system>
203
+ **IMPORTANT: This system uses NORMALIZED COORDINATES (0 to 1000)**
204
+
205
+ You must use normalized coordinates:
206
+ - **x-coordinate**: 0 = left edge, 1000 = right edge of screen
207
+ - **y-coordinate**: 0 = top edge, 1000 = bottom edge of screen
208
+ - **Example**: Center of screen is (500, 500)
209
+ - **Example**: Top-left corner is (0, 0)
210
+ - **Example**: Bottom-right corner is (1000, 1000)
211
+
212
+ When you see an element on the screenshot:
213
+ 1. Estimate its position relative to the screen dimensions
214
+ 2. Convert to normalized coordinates between 0 and 1000
215
+ 3. Use these normalized coordinates in your tool calls
216
+
217
+ **Never use pixel coordinates directly - always use normalized coordinates between 0 and 1000**
218
+ </coordinate_system>
219
+
220
+ <web_form_guidelines>
221
+ Look at the elements on the screen (input fields, checkboxes, buttons, dropdowns) to decide where to interact.
222
+ **Always use normalized coordinates (0 to 1000) based on the element's relative position on the screen.**
223
+
224
+ ### Typical Web Form Interactions
225
+ - **Input fields**: click in the field first to focus it, then use `write("text")`.
226
+ - **Passwords**: type them just like text — `write("password123")`.
227
+ - **Checkboxes / radio buttons**: use `click(x,y)` directly on the box/circle. Click on the box/circle itself at the left side of the text, not on the text label.
228
+ - **Dropdown menus**: click to open, then click the option or use arrow keys + press("enter").
229
+ - **Submit buttons**: identify clearly labelled "Sign up", "Sign in", "Submit" buttons and click at their normalized coordinates.
230
+ - **Captcha or 2FA**: wait or prompt for external handling (do not bypass security).
231
+ - **Pop-ups**: try `press("enter")` to confirm or `press("escape")` to close if they block your action.
232
+
233
+ ### Grouping Multiple Inputs
234
+ - If multiple fields, checkboxes, or similar controls are clearly visible **and** can be filled/clicked in sequence without ambiguity, you may include **several actions in one code block**.
235
+ - Keep each Action still in a **Python code block** but with multiple tool calls inside, for example:
236
+ ```python
237
+ click(470, 300) # Email field (normalized coordinates)
238
+ write("user@example.com")
239
+ click(470, 350) # Password field (normalized coordinates)
240
+ write("mypassword123")
241
+ click(450, 550) # Checkbox left side of the text "Accept terms" (normalized coordinates)
242
+ ```<end_code>
243
+
244
+ - Only group actions when:
245
+ 1. They're all part of the **same form or step**,
246
+ 2. The screenshot clearly shows all elements and coordinates,
247
+ 3. The order of operations is obvious.
248
+ - Otherwise, default back to one Action per step.
249
+
250
+ ### Precision
251
+ - Always **click before typing** to ensure the right field is active.
252
+ - Always **scroll if needed** to bring elements into view before clicking.
253
+ - Always **validate each action** via the screenshot before continuing.
254
+ - Always use **normalized coordinates between 0 and 1000**.
255
+ </web_form_guidelines>
256
+
257
+ <task_resolution_example>
258
+ For a task like "Sign up for an account and submit the form":
259
+
260
+ Step 1:
261
+ Short term goal: I want to open the signup page.
262
+ What I see: The browser is open on the homepage.
263
+ Reflection: I will open the signup URL directly.
264
+ Action:
265
+ ```python
266
+ open("https://example.com/signup")
267
+ wait(3)
268
+ ```<end_code>
269
+
270
+ Step 2:
271
+ Short term goal: I want to fill the form fields that are currently visible.
272
+ What I see: I see the signup form with "Email" and "Password" fields, plus a checkbox for accepting terms.
273
+ Reflection: I will fill all the visible form fields in sequence - click the email field and type the email, then click the password field and type the password, then click the checkbox to accept terms.
274
+ Action:
275
+ ```python
276
+ click(470, 300) # Email field (normalized coordinates)
277
+ write("user@example.com")
278
+ click(470, 350) # Password field (normalized coordinates)
279
+ write("mypassword123")
280
+ click(450, 550) # Checkbox left side of the text "Accept terms" (normalized coordinates)
281
+ ```<end_code>
282
+
283
+ Step 3:
284
+ Short term goal: I need to scroll down to see the "Sign Up" button.
285
+ What I see: The form fields are filled, but I cannot see the "Sign Up" button - it's likely below the current view.
286
+ Reflection: I will scroll down to bring the submit button into view so I can click it in the next step.
287
+ Action:
288
+ ```python
289
+ scroll(500, 500, "down", 3)
290
+ ```<end_code>
291
+
292
+ Step 4:
293
+ Short term goal: I want to submit the form.
294
+ What I see: The "Sign Up" button is at the bottom center, around 520, 650 in normalized coordinates.
295
+ Reflection: I will click the button to submit.
296
+ Action:
297
+ ```python
298
+ click(520, 650)
299
+ wait(3)
300
+ ```<end_code>
301
+
302
+ Step 5:
303
+ Short term goal: Verify signup completed.
304
+ What I see: A confirmation page "Welcome user@example.com".
305
+ Reflection: Task complete.
306
+ Action:
307
+ ```python
308
+ final_answer("Signup completed")
309
+ ```<end_code>
310
+ </task_resolution_example>
311
+
312
+ <general_guidelines>
313
+ # GUI Agent Guidelines for Web Forms (0-1000 Coordinates)
314
+
315
+ ## Environment Overview
316
+ Ubuntu 22.04 XFCE4 desktop with Google Chrome/Chromium browser.
317
+ Agent can fill forms, sign up, sign in, click checkboxes, submit claims.
318
+ **All coordinates are normalized between 0 and 1000.**
319
+
320
+ ## Core Principles
321
+
322
+ ### 1. Screenshot Analysis
323
+ - Always analyze the latest screenshot carefully before each action.
324
+ - Validate that previous actions worked by examining the current state.
325
+ - If an action didn't work, try an alternative rather than repeating blindly.
326
+
327
+ ### 2. Action Execution
328
+ - Execute one or multiple actions at a time (grouped in one code block).
329
+ - Wait for appropriate loading times using `wait()` but not indefinitely.
330
+ - Scroll to bring hidden elements into view.
331
+
332
+ ### 3. Coordinate System
333
+ - **CRITICAL**: Always use normalized coordinates (0 to 1000)
334
+ - Convert visual position on screen to normalized coordinates
335
+ - Center of screen = (500, 500)
336
+ - Top-left = (0, 0), Bottom-right = (1000, 1000)
337
+
338
+ ### 4. Keyboard Shortcuts
339
+ - Use `tab` to move between fields, `space` to toggle checkboxes, `enter` to submit forms.
340
+ - Copy/paste: `ctrl+C`, `ctrl+V`.
341
+ - Refresh page: `refresh()`.
342
+
343
+ ### 5. Error Recovery
344
+ - If clicking doesn't work, try double_click or right_click.
345
+ - If typing doesn't appear, ensure the field is focused with click.
346
+ - If popups block the screen, try `press("enter")` or `press("escape")`.
347
+
348
+ ### 6. Security & Privacy
349
+ - Don't attempt to bypass captchas or 2FA automatically.
350
+ - Don't store credentials in plain text unless instructed.
351
+
352
+ ### 7. Final Answer
353
+ - When the form is successfully submitted or the goal achieved, use:
354
+ ```python
355
+ final_answer("Done")
356
+ ```<end_code>
357
+ </general_guidelines>
358
+ """
359
+
360
+
361
+ class NormalizedCoordinatesSystemPrompt(Enum):
362
+ """Normalized coordinates system prompt"""
363
+
364
+ FORM_SYSTEM_PROMPT = """You are a web form automation assistant that can control a remote desktop environment with a web browser open.
365
+ The current date is <<current_date>>.
366
+
367
+ <action_process>
368
+ You will be given a task to complete in several steps (e.g. filling forms, signing up, logging in, submitting claims).
369
+ At each step you will perform **one action**.
370
+ After each action, you will receive an updated screenshot.
371
+ Then you will proceed as follows, with these sections — do not skip any:
372
+
373
+ Short term goal: ...
374
+ What I see: ...
375
+ Reflection: ...
376
+ Action:
377
+ ```python
378
+ tool_name(arguments)
379
+ ```<end_code>
380
+
381
+ Always format your Action section as **Python code blocks** exactly as shown above.
382
+ </action_process>
383
+
384
+ <tools>
385
+ On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
386
+ {%- for tool in tools.values() %}
387
+ - {{ tool.name }}: {{ tool.description }}
388
+ Takes inputs: {{tool.inputs}}
389
+ Returns an output of type: {{tool.output_type}}
390
+ {%- endfor %}
391
+ </tools>
392
+
393
+ <coordinate_system>
394
+ **IMPORTANT: This system uses NORMALIZED COORDINATES (0.0 to 1.0)**
395
+
396
+ You must use normalized coordinates:
397
+ - **x-coordinate**: 0.0 = left edge, 1.0 = right edge of screen
398
+ - **y-coordinate**: 0.0 = top edge, 1.0 = bottom edge of screen
399
+ - **Example**: Center of screen is (0.5, 0.5)
400
+ - **Example**: Top-left corner is (0.0, 0.0)
401
+ - **Example**: Bottom-right corner is (1.0, 1.0)
402
+
403
+ When you see an element on the screenshot:
404
+ 1. Estimate its position relative to the screen dimensions
405
+ 2. Convert to normalized coordinates between 0.0 and 1.0
406
+ 3. Use these normalized coordinates in your tool calls
407
+
408
+ **Never use pixel coordinates directly - always use normalized coordinates between 0.0 and 1.0**
409
+ </coordinate_system>
410
+
411
+ <web_form_guidelines>
412
+ Look at the elements on the screen (input fields, checkboxes, buttons, dropdowns) to decide where to interact.
413
+ **Always use normalized coordinates (0.0 to 1.0) based on the element's relative position on the screen.**
414
+
415
+ ### Typical Web Form Interactions
416
+ - **Input fields**: click in the field first to focus it, then use `write("text")`.
417
+ - **Passwords**: type them just like text — `write("password123")`.
418
+ - **Checkboxes / radio buttons**: use `click(x,y)` directly on the box/circle.
419
+ - **Dropdown menus**: click to open, then click the option or use arrow keys + press("enter").
420
+ - **Submit buttons**: identify clearly labelled "Sign up", "Sign in", "Submit" buttons and click at their normalized coordinates.
421
+ - **Captcha or 2FA**: wait or prompt for external handling (do not bypass security).
422
+ - **Pop-ups**: try `press("enter")` to confirm or `press("escape")` to close if they block your action.
423
+
424
+ ### Grouping Multiple Inputs
425
+ - If multiple fields, checkboxes, or similar controls are clearly visible **and** can be filled/clicked in sequence without ambiguity, you may include **several actions in one code block**.
426
+ - Keep each Action still in a **Python code block** but with multiple tool calls inside, for example:
427
+ ```python
428
+ click(0.47, 0.30) # Email field (normalized coordinates)
429
+ wait(0.1)
430
+ write("user@example.com")
431
+ click(0.47, 0.35) # Password field (normalized coordinates)
432
+ wait(0.1)
433
+ write("mypassword123")
434
+ click(0.45, 0.55) # Checkbox "Accept terms" (normalized coordinates)
435
+ wait(0.1)
436
+ ```<end_code>
437
+ - Only group actions when:
438
+ 1. They're all part of the **same form or step**,
439
+ 2. The screenshot clearly shows all elements and coordinates,
440
+ 3. The order of operations is obvious.
441
+ - Otherwise, default back to one Action per step.
442
+
443
+ ### Precision
444
+ - Always **click before typing** to ensure the right field is active.
445
+ - Always **scroll if needed** to bring elements into view before clicking.
446
+ - Always **validate each action** via the screenshot before continuing.
447
+ - Always use **normalized coordinates between 0.0 and 1.0**.
448
+ </web_form_guidelines>
449
+
450
+ <task_resolution_example>
451
+ For a task like "Sign up for an account and submit the form":
452
+
453
+ Step 1:
454
+ Short term goal: I want to open the signup page.
455
+ What I see: The browser is open on the homepage.
456
+ Reflection: I will open the signup URL directly.
457
+ Action:
458
+ ```python
459
+ open("https://example.com/signup")
460
+ wait(3)
461
+ ```<end_code>
462
+
463
+ Step 2:
464
+ Short term goal: I want to fill the "Email" field.
465
+ What I see: I see the signup form with an "Email" field roughly in the center-left of the screen.
466
+ Reflection: I will click inside the field (approximately 0.47, 0.30 in normalized coordinates) then type my email.
467
+ Action:
468
+ ```python
469
+ click(0.47, 0.30)
470
+ write("user@example.com")
471
+ ```<end_code>
472
+
473
+ Step 3:
474
+ Short term goal: I want to check the "I accept terms" checkbox.
475
+ What I see: The checkbox is in the lower portion of the form, around 0.45, 0.55 in normalized coordinates.
476
+ Reflection: I will click it.
477
+ Action:
478
+ ```python
479
+ click(0.45, 0.55)
480
+ ```<end_code>
481
+
482
+ Step 4:
483
+ Short term goal: I want to submit the form.
484
+ What I see: The "Sign Up" button is at the bottom center, around 0.52, 0.65 in normalized coordinates.
485
+ Reflection: I will click the button to submit.
486
+ Action:
487
+ ```python
488
+ click(0.52, 0.65)
489
+ wait(3)
490
+ ```<end_code>
491
+
492
+ Step 5:
493
+ Short term goal: Verify signup completed.
494
+ What I see: A confirmation page "Welcome user@example.com".
495
+ Reflection: Task complete.
496
+ Action:
497
+ ```python
498
+ final_answer("Signup completed")
499
+ ```<end_code>
500
+ </task_resolution_example>
501
+
502
+ <general_guidelines>
503
+ # GUI Agent Guidelines for Web Forms (Normalized Coordinates)
504
+
505
+ ## Environment Overview
506
+ Ubuntu 22.04 XFCE4 desktop with Google Chrome/Chromium browser.
507
+ Agent can fill forms, sign up, sign in, click checkboxes, submit claims.
508
+ **All coordinates are normalized between 0.0 and 1.0.**
509
+
510
+ ## Core Principles
511
+
512
+ ### 1. Screenshot Analysis
513
+ - Always analyze the latest screenshot carefully before each action.
514
+ - Validate that previous actions worked by examining the current state.
515
+ - If an action didn't work, try an alternative rather than repeating blindly.
516
+
517
+ ### 2. Action Execution
518
+ - Execute one or multiple actions at a time (grouped in one code block).
519
+ - Wait for appropriate loading times using `wait()` but not indefinitely.
520
+ - Scroll to bring hidden elements into view.
521
+
522
+ ### 3. Coordinate System
523
+ - **CRITICAL**: Always use normalized coordinates (0.0 to 1.0)
524
+ - Convert visual position on screen to normalized coordinates
525
+ - Center of screen = (0.5, 0.5)
526
+ - Top-left = (0.0, 0.0), Bottom-right = (1.0, 1.0)
527
+
528
+ ### 4. Keyboard Shortcuts
529
+ - Use `tab` to move between fields, `space` to toggle checkboxes, `enter` to submit forms.
530
+ - Copy/paste: `ctrl+C`, `ctrl+V`.
531
+ - Refresh page: `refresh()`.
532
+
533
+ ### 5. Error Recovery
534
+ - If clicking doesn't work, try double_click or right_click.
535
+ - If typing doesn't appear, ensure the field is focused with click.
536
+ - If popups block the screen, try `press("enter")` or `press("escape")`.
537
+
538
+ ### 6. Security & Privacy
539
+ - Don't attempt to bypass captchas or 2FA automatically.
540
+ - Don't store credentials in plain text unless instructed.
541
+
542
+ ### 7. Final Answer
543
+ - When the form is successfully submitted or the goal achieved, use:
544
+ ```python
545
+ final_answer("Done")
546
+ ```<end_code>
547
+ </general_guidelines>
548
+ """
cua2-core/src/cua2-core/services/models/anthropic.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from smolagents import LiteLLMModel
2
+
3
+
4
class AnthropicModel(LiteLLMModel):
    """LiteLLM-backed model wrapper selected for Anthropic model ids.

    The class adds a ``MODEL_TYPE`` tag on top of ``LiteLLMModel`` and
    otherwise defers entirely to the parent implementation.
    """

    # Tag identifying the provider family of this wrapper.
    MODEL_TYPE = "anthropic"

    def __init__(self, model_id: str, **kwargs):
        """Create the model.

        Args:
            model_id: Identifier passed to ``LiteLLMModel`` as ``model_id``.
            **kwargs: Any additional ``LiteLLMModel`` constructor options.
                They are forwarded unchanged so callers can configure the
                underlying client; calling with ``model_id`` alone behaves
                exactly as before.
        """
        super().__init__(model_id=model_id, **kwargs)
cua2-core/src/cua2-core/services/models/gemini.py ADDED
File without changes
cua2-core/src/cua2-core/services/models/get_model.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from smolagents import Model
2
+
3
+ from backend.models.models import AgentType
4
+ from backend.services.models.anthropic import AnthropicModel
5
+
6
+
7
def get_model(model_id: str) -> tuple[Model, AgentType]:
    """Build the model for *model_id* together with its agent type.

    Only ids containing the substring ``"sonnet"`` are recognised; they map
    to an ``AnthropicModel`` paired with ``AgentType.PIXEL_COORDINATES``.

    Raises:
        ValueError: If *model_id* does not contain ``"sonnet"``.
    """
    # Reject unknown ids up front so the supported case reads straight through.
    if "sonnet" not in model_id:
        raise ValueError(f"Model {model_id} not found")

    anthropic_model = AnthropicModel(model_id=model_id)
    return anthropic_model, AgentType.PIXEL_COORDINATES
cua2-core/src/cua2-core/services/models/qwen.py ADDED
File without changes
cua2-core/src/cua2-core/websocket/websocket_manager.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ from typing import Dict, Optional, Set
4
+
5
+ from fastapi import WebSocket
6
+
7
+ from backend.models.models import AgentMetadata, WebSocketEvent
8
+
9
+
10
class WebSocketManager:
    """Track accepted WebSocket connections and send events to them.

    Events are objects exposing ``model_dump()``; they are JSON-encoded once
    per send and delivered with ``send_text``. A connection whose send raises
    is removed from the manager. A message that cannot be encoded is reported
    and skipped without touching any connection.
    """

    def __init__(self):
        # Connections that were accepted and have not been disconnected.
        self.active_connections: Set[WebSocket] = set()
        # Optional task associated with a connection; cancelled on disconnect.
        self.connection_tasks: Dict[WebSocket, asyncio.Task] = {}

    @staticmethod
    def _serialize(message: WebSocketEvent) -> Optional[str]:
        """Return *message* as a JSON string, or ``None`` if encoding fails."""
        try:
            return json.dumps(message.model_dump())
        except Exception as e:
            # Kept broad on purpose: the public send methods have never raised
            # on a bad payload, and callers rely on that.
            print(f"Error serializing WebSocket event: {e}")
            return None

    async def connect(self, websocket: WebSocket):
        """Accept *websocket* and start tracking it."""
        await websocket.accept()
        self.active_connections.add(websocket)
        print(f"WebSocket connected. Total connections: {len(self.active_connections)}")

    def disconnect(self, websocket: WebSocket):
        """Stop tracking *websocket* and cancel its task, if one is registered."""
        self.active_connections.discard(websocket)
        task = self.connection_tasks.pop(websocket, None)
        if task is not None:
            task.cancel()
        print(
            f"WebSocket disconnected. Total connections: {len(self.active_connections)}"
        )

    async def send_personal_message(
        self, message: WebSocketEvent, websocket: WebSocket
    ):
        """Send *message* to one connection, dropping it if the send fails."""
        payload = self._serialize(message)
        if payload is None:
            # The payload is at fault, not the socket: keep the connection.
            return

        try:
            await websocket.send_text(payload)
        except Exception as e:
            print(f"Error sending personal message: {e}")
            # Only disconnect if the connection is still in our set
            if websocket in self.active_connections:
                self.disconnect(websocket)

    async def broadcast(self, message: WebSocketEvent):
        """Send *message* to every tracked connection.

        Connections whose send raises are removed after the loop. Nothing is
        sent, and nobody is disconnected, when the message cannot be encoded.
        """
        if not self.active_connections:
            return

        # Encode once: the text is identical for every recipient.
        payload = self._serialize(message)
        if payload is None:
            return

        failed = []
        # Iterate over a copy because the set may change while we await.
        for connection in self.active_connections.copy():
            try:
                await connection.send_text(payload)
            except Exception as e:
                print(f"Error broadcasting to connection: {e}")
                failed.append(connection)

        # Remove failed connections
        for connection in failed:
            if connection in self.active_connections:
                self.disconnect(connection)

    async def send_agent_start(self, content: str, message_id: str):
        """Broadcast an ``agent_start`` event."""
        event = WebSocketEvent(
            type="agent_start", content=content, messageId=message_id
        )
        await self.broadcast(event)

    async def send_agent_progress(self, content: str, message_id: str):
        """Broadcast an ``agent_progress`` event."""
        event = WebSocketEvent(
            type="agent_progress", content=content, messageId=message_id
        )
        await self.broadcast(event)

    async def send_agent_complete(
        self, content: str, message_id: str, metadata: Optional[AgentMetadata] = None
    ):
        """Broadcast an ``agent_complete`` event, optionally with metadata."""
        event = WebSocketEvent(
            type="agent_complete",
            content=content,
            messageId=message_id,
            metadata=metadata,
        )
        await self.broadcast(event)

    async def send_agent_error(self, content: str, message_id: Optional[str] = None):
        """Broadcast an ``agent_error`` event."""
        event = WebSocketEvent(
            type="agent_error", content=content, messageId=message_id
        )
        await self.broadcast(event)

    async def send_vnc_url_set(self, vnc_url: str, content: Optional[str] = None):
        """Broadcast a ``vnc_url_set`` event carrying *vnc_url*.

        A default description is used when *content* is empty or ``None``.
        """
        event = WebSocketEvent(
            type="vnc_url_set",
            content=content or f"VNC stream available at: {vnc_url}",
            vncUrl=vnc_url,
        )
        await self.broadcast(event)

    async def send_vnc_url_unset(self, content: Optional[str] = None):
        """Broadcast a ``vnc_url_unset`` event (reset to default display).

        A default description is used when *content* is empty or ``None``.
        """
        event = WebSocketEvent(
            type="vnc_url_unset",
            content=content or "VNC stream disconnected, showing default display",
        )
        await self.broadcast(event)

    def get_connection_count(self) -> int:
        """Return the number of tracked connections."""
        return len(self.active_connections)
cua2-front/.gitignore ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Logs
2
+ logs
3
+ *.log
4
+ npm-debug.log*
5
+ yarn-debug.log*
6
+ yarn-error.log*
7
+ pnpm-debug.log*
8
+ lerna-debug.log*
9
+
10
+ node_modules
11
+ dist
12
+ dist-ssr
13
+ *.local
14
+
15
+ # Editor directories and files
16
+ .vscode/*
17
+ !.vscode/extensions.json
18
+ .idea
19
+ .DS_Store
20
+ *.suo
21
+ *.ntvs*
22
+ *.njsproj
23
+ *.sln
24
+ *.sw?
cua2-front/index.html ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <link rel="icon" type="image/x-icon" href="/favicon.ico" />
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
+ <title>CUA2</title>
8
+ </head>
9
+ <body>
10
+ <div id="root"></div>
11
+ <script type="module" src="/src/main.tsx"></script>
12
+ </body>
13
+ </html>
14
+
cua2-front/package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
cua2-front/package.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "cua2-front",
3
+ "private": true,
4
+ "version": "0.0.0",
5
+ "type": "module",
6
+ "scripts": {
7
+ "dev": "vite",
8
+ "build": "vite build",
9
+ "build:dev": "vite build --mode development",
10
+ "lint": "eslint src/ --config src/eslint.config.js",
11
+ "type-check": "tsc --noEmit --project src/tsconfig.json",
12
+ "preview": "vite preview"
13
+ },
14
+ "dependencies": {
15
+ "react": "^18.3.1",
16
+ "react-router-dom": "^6.30.1",
17
+ "react-dom": "^18.3.1"
18
+ },
19
+ "devDependencies": {
20
+ "@eslint/js": "^9.32.0",
21
+ "@types/node": "^22.16.5",
22
+ "@types/react": "^18.3.23",
23
+ "@types/react-dom": "^18.3.7",
24
+ "@vitejs/plugin-react-swc": "^3.11.0",
25
+ "autoprefixer": "^10.4.21",
26
+ "eslint": "^9.32.0",
27
+ "eslint-plugin-react-hooks": "^5.2.0",
28
+ "eslint-plugin-react-refresh": "^0.4.20",
29
+ "globals": "^15.15.0",
30
+ "typescript-eslint": "^8.38.0",
31
+ "vite": "^5.4.19"
32
+ }
33
+ }
cua2-front/src/App.tsx ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React from 'react';
2
+ import { BrowserRouter, Routes, Route } from "react-router-dom";
3
+ import Index from "./pages/Index";
4
+
5
// Root component: mounts the client-side router and its route table.
const App = () => {
  return (
    <BrowserRouter>
      <Routes>
        <Route path="/" element={<Index />} />
        {/* Register additional routes here; keep any catch-all "*" route last. */}
      </Routes>
    </BrowserRouter>
  );
};
14
+
15
+ export default App;
cua2-front/src/hooks/useWebSocket.ts ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { WebSocketEvent } from '@/types/agent';
2
+ import { useCallback, useEffect, useRef, useState } from 'react';
3
+
4
/** Options accepted by {@link useWebSocket}. */
interface UseWebSocketProps {
  /** WebSocket endpoint to connect to; changing it tears down and reopens the connection. */
  url: string;
  /** Called with every message that parses as JSON. */
  onMessage: (event: WebSocketEvent) => void;
  /** Called (throttled) when the socket reports an error after a first successful connection. */
  onError?: (error: Event) => void;
}

/**
 * Manage a single WebSocket connection with bounded automatic reconnection.
 *
 * The hook connects on mount (and whenever `url` changes), parses incoming
 * messages as JSON and forwards them to `onMessage`, and retries an
 * unexpectedly closed connection up to three times with a capped, jittered
 * backoff. The connection is closed on unmount.
 *
 * @returns `isConnected`, `connectionState`, `sendMessage` (JSON-serialises
 *   and sends when the socket is open), `reconnect` (connect if not already
 *   connected/connecting), `disconnect`, and `manualReconnect` (disconnect,
 *   then reconnect after one second with a fresh attempt budget).
 */
export const useWebSocket = ({ url, onMessage, onError }: UseWebSocketProps) => {
  const [isConnected, setIsConnected] = useState(false);
  const [connectionState, setConnectionState] = useState<'connecting' | 'connected' | 'disconnected' | 'error'>('disconnected');
  const wsRef = useRef<WebSocket | null>(null);
  // ReturnType<typeof setTimeout> is correct in both browser and Node typings.
  const reconnectTimeoutRef = useRef<ReturnType<typeof setTimeout>>();
  const reconnectAttemptsRef = useRef(0);
  const maxReconnectAttempts = 3; // Only try three times, then stop
  const baseReconnectDelay = 3000; // Start with 3 seconds
  const maxReconnectDelay = 5000; // Max 5 seconds
  const lastErrorTimeRef = useRef(0);
  const errorThrottleMs = 5000; // Only report an error once every 5 seconds
  const isInitialConnectionRef = useRef(true); // True until the first successful connection

  // Keep the latest callbacks in refs so socket handlers never call a stale
  // closure, while `connect` stays stable and only changes with `url`.
  const onMessageRef = useRef(onMessage);
  const onErrorRef = useRef(onError);
  useEffect(() => {
    onMessageRef.current = onMessage;
    onErrorRef.current = onError;
  });

  /** Cancel any pending (automatic or manual) reconnect timer. */
  const clearReconnectTimer = useCallback(() => {
    if (reconnectTimeoutRef.current !== undefined) {
      clearTimeout(reconnectTimeoutRef.current);
      reconnectTimeoutRef.current = undefined;
    }
  }, []);

  /** Exponential backoff capped at `maxReconnectDelay`, plus up to 1s of jitter. */
  const getReconnectDelay = () => {
    const delay = Math.min(
      baseReconnectDelay * Math.pow(2, reconnectAttemptsRef.current),
      maxReconnectDelay
    );
    return delay + Math.random() * 1000;
  };

  const connect = useCallback(() => {
    const existing = wsRef.current;
    if (existing?.readyState === WebSocket.OPEN || existing?.readyState === WebSocket.CONNECTING) {
      return; // Already connected or connecting
    }

    try {
      setConnectionState('connecting');
      const ws = new WebSocket(url);
      wsRef.current = ws;

      ws.onopen = () => {
        console.log('WebSocket connected');
        setIsConnected(true);
        setConnectionState('connected');
        reconnectAttemptsRef.current = 0; // Reset attempts on successful connection
        isInitialConnectionRef.current = false; // Mark that we've had a successful connection
      };

      ws.onmessage = (event) => {
        try {
          const data = JSON.parse(event.data) as WebSocketEvent;
          onMessageRef.current(data);
        } catch (error) {
          console.error('Failed to parse WebSocket message:', error);
        }
      };

      ws.onerror = (error) => {
        console.error('WebSocket error:', error);
        setConnectionState('error');

        // Stay quiet on the very first connection failure; only notify once
        // a connection has succeeded before (or a manual reconnect was asked for).
        if (!isInitialConnectionRef.current) {
          const now = Date.now();
          if (now - lastErrorTimeRef.current > errorThrottleMs) {
            lastErrorTimeRef.current = now;
            onErrorRef.current?.(error);
          }
        }
      };

      ws.onclose = (event) => {
        console.log('WebSocket disconnected', { code: event.code, reason: event.reason });
        setIsConnected(false);
        setConnectionState('disconnected');

        // Check the normal-closure case first so that it is never reported
        // as an error, regardless of how many attempts were used.
        if (event.code === 1000) {
          console.log('WebSocket closed normally, not reconnecting');
          return;
        }

        if (reconnectAttemptsRef.current >= maxReconnectAttempts) {
          console.log('Max reconnection attempts reached');
          setConnectionState('error');
          return;
        }

        const delay = getReconnectDelay();
        console.log(`Attempting to reconnect in ${Math.round(delay)}ms (attempt ${reconnectAttemptsRef.current + 1}/${maxReconnectAttempts})`);

        reconnectTimeoutRef.current = setTimeout(() => {
          reconnectAttemptsRef.current++;
          connect();
        }, delay);
      };
    } catch (error) {
      console.error('Failed to create WebSocket connection:', error);
      setConnectionState('error');
    }
  }, [url]);

  const disconnect = useCallback(() => {
    clearReconnectTimer();

    const ws = wsRef.current;
    if (ws) {
      // Detach handlers before closing: the close event of an intentionally
      // closed socket is not guaranteed to carry code 1000 (e.g. when it is
      // closed while still connecting), which would otherwise schedule an
      // automatic reconnect after unmount or manual disconnect.
      ws.onopen = null;
      ws.onmessage = null;
      ws.onerror = null;
      ws.onclose = null;
      ws.close(1000, 'Manual disconnect');
      wsRef.current = null;
    }

    setIsConnected(false);
    setConnectionState('disconnected');
    reconnectAttemptsRef.current = 0;
  }, [clearReconnectTimer]);

  const manualReconnect = useCallback(() => {
    console.log('Manual reconnect requested');
    disconnect(); // also resets the attempt counter
    isInitialConnectionRef.current = false; // Allow error notifications on manual reconnect
    // Track the timer so that disconnect()/unmount can cancel it.
    reconnectTimeoutRef.current = setTimeout(() => connect(), 1000);
  }, [disconnect, connect]);

  const sendMessage = (message: unknown) => {
    if (wsRef.current?.readyState === WebSocket.OPEN) {
      try {
        wsRef.current.send(JSON.stringify(message));
      } catch (error) {
        console.error('Failed to send WebSocket message:', error);
      }
    } else {
      console.warn('WebSocket is not connected');
    }
  };

  // `connect` only changes when `url` does, so this still (re)connects
  // exactly on mount and on url change.
  useEffect(() => {
    connect();
    return disconnect;
  }, [connect, disconnect]);

  return {
    isConnected,
    connectionState,
    sendMessage,
    reconnect: connect,
    disconnect,
    manualReconnect
  };
};
cua2-front/src/index.css ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ * {
2
+ margin: 0;
3
+ padding: 0;
4
+ box-sizing: border-box;
5
+ }
6
+
7
+ body {
8
+ margin: 0;
9
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
10
+ 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',
11
+ sans-serif;
12
+ -webkit-font-smoothing: antialiased;
13
+ -moz-osx-font-smoothing: grayscale;
14
+ }
15
+
16
+ #root {
17
+ width: 100%;
18
+ height: 100vh;
19
+ }
20
+
cua2-front/src/main.tsx ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ import { createRoot } from "react-dom/client";
2
+ import App from "./App.tsx";
3
+ import "./index.css";
4
+
5
// Mount the React tree into the element with id "root".
const container = document.getElementById("root")!;
const root = createRoot(container);
root.render(<App />);
cua2-front/src/pages/Index.tsx ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React from 'react';
2
+ import { useWebSocket } from '@/hooks/useWebSocket';
3
+ import { AgentMessage, WebSocketEvent } from '@/types/agent';
4
+ import { useEffect, useState } from 'react';
5
+
6
/**
 * Landing page. Opens the agent WebSocket, folds incoming agent events into
 * local state (message list, processing flag, VNC URL) and exposes a helper
 * for sending a user task. The rendered output is currently a placeholder.
 */
const Index = () => {
  const [messages, setMessages] = useState<AgentMessage[]>([]);
  const [isAgentProcessing, setIsAgentProcessing] = useState(false);
  const [vncUrl, setVncUrl] = useState<string>('');

  // WebSocket endpoint. Vite exposes VITE_-prefixed variables on
  // import.meta.env; fall back to the local development server when unset.
  const WS_URL: string = import.meta.env.VITE_WS_URL || 'ws://localhost:8000/ws';

  /** Apply one server event to local state. */
  const handleWebSocketMessage = (event: WebSocketEvent) => {
    console.log('WebSocket event received:', event);

    switch (event.type) {
      case 'agent_start':
        setIsAgentProcessing(true);
        // NOTE(review): the guard tests `content` while the message is built
        // from `instructions`/`modelId` - confirm against the backend payload.
        if (event.content) {
          const newMessage: AgentMessage = {
            id: event.messageId,
            type: 'agent',
            instructions: event.instructions,
            modelId: event.modelId,
            timestamp: new Date(),
            isLoading: true,
          };
          setMessages(prev => [...prev, newMessage]);
        }
        break;

      case 'agent_progress': {
        // Guard on the object that is actually dereferenced below.
        const step = event.agentStep;
        if (step) {
          // Append the step to its message unless a step with the same id is already recorded.
          setMessages(prev =>
            prev.map(msg => {
              if (msg.id !== step.messageId) {
                return msg;
              }
              const existingSteps = msg.steps || [];
              const stepExists = existingSteps.some(s => s.stepId === step.stepId);
              return stepExists
                ? msg
                : { ...msg, steps: [...existingSteps, step], isLoading: true };
            })
          );
        }
        break;
      }

      case 'agent_complete': {
        setIsAgentProcessing(false);
        // Guard on the object that is actually dereferenced below.
        const metadata = event.metadata;
        if (metadata) {
          setMessages(prev =>
            prev.map(msg =>
              msg.id === metadata.messageId
                ? { ...msg, isLoading: false, metadata }
                : msg
            )
          );
        }
        break;
      }

      case 'agent_error':
        setIsAgentProcessing(false);
        // TODO: Handle agent error
        break;

      case 'vnc_url_set':
        if (event.vncUrl) {
          setVncUrl(event.vncUrl);
        }
        // TODO: Handle VNC URL set
        break;

      case 'vnc_url_unset':
        setVncUrl('');
        // TODO: Handle VNC URL unset
        break;

      case 'heartbeat':
        console.log('Heartbeat received:', event);
        break;
    }
  };

  const handleWebSocketError = () => {
    // Intentionally empty: error notifications are throttled inside the WebSocket hook.
  };

  const { isConnected, connectionState, sendMessage, manualReconnect } = useWebSocket({
    url: WS_URL,
    onMessage: handleWebSocketMessage,
    onError: handleWebSocketError,
  });

  /** Record the user's message locally and forward it to the backend as a task. */
  const handleSendMessage = (content: string) => {
    const userMessage: AgentMessage = {
      id: Date.now().toString(),
      type: 'user',
      content,
      timestamp: new Date(),
    };

    setMessages(prev => [...prev, userMessage]);

    // Send message to Python backend via WebSocket
    sendMessage({
      type: 'user_task',
      content,
      model_id: "anthropic/claude-sonnet-4-5-20250929",
      timestamp: new Date().toISOString(),
    });
  };

  return (
    <div>
      <h1>Hello World</h1>
    </div>
  );
};
131
+
132
+ export default Index;
cua2-front/src/types/agent.ts ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/** One entry in the conversation list kept by the Index page. */
export interface AgentMessage {
  id: string;
  type: 'user' | 'agent';
  timestamp: Date;
  /** Text typed by the user; set on messages of type 'user'. */
  content?: string;
  /** Task instructions echoed by the backend; set on messages of type 'agent'. */
  instructions?: string;
  /** Identifier of the model handling the task; set on messages of type 'agent'. */
  modelId?: string;
  /** Steps received so far for this message. */
  steps?: AgentStep[];
  /** Summary attached when the agent run completes. */
  metadata?: AgentMetadata;
  isLoading?: boolean;
}

/** A single step of an agent run. */
export interface AgentStep {
  /** Id of the AgentMessage this step belongs to. */
  messageId: string;
  stepId: string;
  image: string;
  generatedText: string;
  actions: string[];
  inputTokensUsed: number;
  outputTokensUsed: number;
  // NOTE(review): events are produced by JSON.parse, so this is presumably a
  // string at runtime rather than a Date instance - confirm with the backend.
  timestamp: Date;
}

/** Totals reported once an agent run has completed. */
export interface AgentMetadata {
  /** Id of the AgentMessage these totals belong to. */
  messageId: string;
  inputTokensUsed: number;
  outputTokensUsed: number;
  timeTaken: number;
  numberOfSteps: number;
}

/** Event received from the backend over the WebSocket. */
export interface WebSocketEvent {
  type: 'agent_start' | 'agent_progress' | 'agent_complete' | 'agent_error' | 'vnc_url_set' | 'vnc_url_unset' | 'heartbeat';
  // The four fields below are read by the Index page's event handler.
  messageId?: string;
  content?: string;
  instructions?: string;
  modelId?: string;
  agentStep?: AgentStep;
  metadata?: AgentMetadata;
  vncUrl?: string;
}
cua2-front/src/vite-env.d.ts ADDED
@@ -0,0 +1 @@
 
 
1
+ /// <reference types="vite/client" />
cua2-front/tsconfig.app.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2020",
4
+ "useDefineForClassFields": true,
5
+ "lib": [
6
+ "ES2020",
7
+ "DOM",
8
+ "DOM.Iterable"
9
+ ],
10
+ "module": "ESNext",
11
+ "skipLibCheck": true,
12
+ /* Bundler mode */
13
+ "moduleResolution": "bundler",
14
+ "allowImportingTsExtensions": true,
15
+ "isolatedModules": true,
16
+ "moduleDetection": "force",
17
+ "noEmit": true,
18
+ "jsx": "react-jsx",
19
+ /* Linting */
20
+ "strict": false,
21
+ "noUnusedLocals": false,
22
+ "noUnusedParameters": false,
23
+ "noImplicitAny": false,
24
+ "noFallthroughCasesInSwitch": false,
25
+ "baseUrl": ".",
26
+ "paths": {
27
+ "@/*": [
28
+ "./src/*"
29
+ ]
30
+ }
31
+ },
32
+ "include": [
33
+ "src",
34
+ ]
35
+ }
cua2-front/tsconfig.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [],
3
+ "references": [{ "path": "./tsconfig.app.json" }, { "path": "./tsconfig.node.json" }],
4
+ "compilerOptions": {
5
+ "baseUrl": ".",
6
+ "paths": {
7
+ "@/*": ["./src/*"]
8
+ },
9
+ "noImplicitAny": false,
10
+ "noUnusedParameters": false,
11
+ "skipLibCheck": true,
12
+ "allowJs": true,
13
+ "noUnusedLocals": false,
14
+ "strictNullChecks": false
15
+ }
16
+ }
cua2-front/tsconfig.node.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2022",
4
+ "lib": ["ES2023"],
5
+ "module": "ESNext",
6
+ "skipLibCheck": true,
7
+
8
+ /* Bundler mode */
9
+ "moduleResolution": "bundler",
10
+ "allowImportingTsExtensions": true,
11
+ "isolatedModules": true,
12
+ "moduleDetection": "force",
13
+ "noEmit": true,
14
+
15
+ /* Linting */
16
+ "strict": true,
17
+ "noUnusedLocals": false,
18
+ "noUnusedParameters": false,
19
+ "noFallthroughCasesInSwitch": true
20
+ },
21
+ "include": ["vite.config.ts"]
22
+ }
cua2-front/vite.config.ts ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { defineConfig } from "vite";
2
+ import react from "@vitejs/plugin-react-swc";
3
+ import path from "path";
4
+
5
+ // https://vitejs.dev/config/
6
export default defineConfig(() => {
  // Absolute path backing the "@" import alias.
  const srcDir = path.resolve(__dirname, "./src");

  return {
    server: {
      host: "::",
      port: 8080,
    },
    plugins: [react()],
    resolve: {
      alias: { "@": srcDir },
    },
  };
});