Spaces:
Running
Running
Amir Mahla
committed on
Commit
·
af1ae43
0
Parent(s):
Init CUA2
Browse files- .gitignore +226 -0
- cua2-core/env.example +11 -0
- cua2-core/pyproject.toml +93 -0
- cua2-core/src/__init__.py +1 -0
- cua2-core/src/cua2-core/__init__.py +1 -0
- cua2-core/src/cua2-core/app.py +64 -0
- cua2-core/src/cua2-core/main.py +37 -0
- cua2-core/src/cua2-core/models/models.py +95 -0
- cua2-core/src/cua2-core/routes/routes.py +56 -0
- cua2-core/src/cua2-core/routes/websocket.py +86 -0
- cua2-core/src/cua2-core/services/agent_service.py +130 -0
- cua2-core/src/cua2-core/services/agents/get_agents.py +57 -0
- cua2-core/src/cua2-core/services/agents/normalized_1000_agent.py +293 -0
- cua2-core/src/cua2-core/services/agents/normalized_agent.py +282 -0
- cua2-core/src/cua2-core/services/agents/pixel_coordonates_agent.py +317 -0
- cua2-core/src/cua2-core/services/agents/prompt.py +548 -0
- cua2-core/src/cua2-core/services/models/anthropic.py +10 -0
- cua2-core/src/cua2-core/services/models/gemini.py +0 -0
- cua2-core/src/cua2-core/services/models/get_model.py +12 -0
- cua2-core/src/cua2-core/services/models/qwen.py +0 -0
- cua2-core/src/cua2-core/websocket/websocket_manager.py +117 -0
- cua2-front/.gitignore +24 -0
- cua2-front/index.html +14 -0
- cua2-front/package-lock.json +0 -0
- cua2-front/package.json +33 -0
- cua2-front/src/App.tsx +15 -0
- cua2-front/src/hooks/useWebSocket.ts +154 -0
- cua2-front/src/index.css +20 -0
- cua2-front/src/main.tsx +5 -0
- cua2-front/src/pages/Index.tsx +132 -0
- cua2-front/src/types/agent.ts +36 -0
- cua2-front/src/vite-env.d.ts +1 -0
- cua2-front/tsconfig.app.json +35 -0
- cua2-front/tsconfig.json +16 -0
- cua2-front/tsconfig.node.json +22 -0
- cua2-front/vite.config.ts +17 -0
.gitignore
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py,cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# UV
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
#uv.lock
|
| 102 |
+
|
| 103 |
+
# poetry
|
| 104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 106 |
+
# commonly ignored for libraries.
|
| 107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 108 |
+
#poetry.lock
|
| 109 |
+
|
| 110 |
+
# pdm
|
| 111 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 112 |
+
#pdm.lock
|
| 113 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
| 114 |
+
# in version control.
|
| 115 |
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
| 116 |
+
.pdm.toml
|
| 117 |
+
.pdm-python
|
| 118 |
+
.pdm-build/
|
| 119 |
+
|
| 120 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 121 |
+
__pypackages__/
|
| 122 |
+
|
| 123 |
+
# Celery stuff
|
| 124 |
+
celerybeat-schedule
|
| 125 |
+
celerybeat.pid
|
| 126 |
+
|
| 127 |
+
# SageMath parsed files
|
| 128 |
+
*.sage.py
|
| 129 |
+
|
| 130 |
+
# Environments
|
| 131 |
+
.env
|
| 132 |
+
.venv
|
| 133 |
+
env/
|
| 134 |
+
venv/
|
| 135 |
+
ENV/
|
| 136 |
+
env.bak/
|
| 137 |
+
venv.bak/
|
| 138 |
+
|
| 139 |
+
# Spyder project settings
|
| 140 |
+
.spyderproject
|
| 141 |
+
.spyproject
|
| 142 |
+
|
| 143 |
+
# Rope project settings
|
| 144 |
+
.ropeproject
|
| 145 |
+
|
| 146 |
+
# mkdocs documentation
|
| 147 |
+
/site
|
| 148 |
+
|
| 149 |
+
# mypy
|
| 150 |
+
.mypy_cache/
|
| 151 |
+
.dmypy.json
|
| 152 |
+
dmypy.json
|
| 153 |
+
|
| 154 |
+
# Pyre type checker
|
| 155 |
+
.pyre/
|
| 156 |
+
|
| 157 |
+
# pytype static type analyzer
|
| 158 |
+
.pytype/
|
| 159 |
+
|
| 160 |
+
# Cython debug symbols
|
| 161 |
+
cython_debug/
|
| 162 |
+
|
| 163 |
+
# PyCharm
|
| 164 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 165 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 166 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 167 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 168 |
+
#.idea/
|
| 169 |
+
|
| 170 |
+
# Abstra
|
| 171 |
+
# Abstra is an AI-powered process automation framework.
|
| 172 |
+
# Ignore directories containing user credentials, local state, and settings.
|
| 173 |
+
# Learn more at https://abstra.io/docs
|
| 174 |
+
.abstra/
|
| 175 |
+
|
| 176 |
+
# Visual Studio Code
|
| 177 |
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
| 178 |
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
| 179 |
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
| 180 |
+
# you could uncomment the following to ignore the entire vscode folder
|
| 181 |
+
# .vscode/
|
| 182 |
+
|
| 183 |
+
# Ruff stuff:
|
| 184 |
+
.ruff_cache/
|
| 185 |
+
|
| 186 |
+
# PyPI configuration file
|
| 187 |
+
.pypirc
|
| 188 |
+
|
| 189 |
+
# Cursor
|
| 190 |
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
| 191 |
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
| 192 |
+
# refer to https://docs.cursor.com/context/ignore-files
|
| 193 |
+
.cursorignore
|
| 194 |
+
.cursorindexingignore
|
| 195 |
+
|
| 196 |
+
gui_agent_demo.mp4
|
| 197 |
+
|
| 198 |
+
recording.mp4
|
| 199 |
+
|
| 200 |
+
uv.lock
|
| 201 |
+
.DS_Store
|
| 202 |
+
|
| 203 |
+
# Logs
|
| 204 |
+
logs
|
| 205 |
+
*.log
|
| 206 |
+
npm-debug.log*
|
| 207 |
+
yarn-debug.log*
|
| 208 |
+
yarn-error.log*
|
| 209 |
+
pnpm-debug.log*
|
| 210 |
+
lerna-debug.log*
|
| 211 |
+
|
| 212 |
+
node_modules
|
| 213 |
+
dist
|
| 214 |
+
dist-ssr
|
| 215 |
+
*.local
|
| 216 |
+
|
| 217 |
+
# Editor directories and files
|
| 218 |
+
.vscode/*
|
| 219 |
+
!.vscode/extensions.json
|
| 220 |
+
.idea
|
| 221 |
+
.DS_Store
|
| 222 |
+
*.suo
|
| 223 |
+
*.ntvs*
|
| 224 |
+
*.njsproj
|
| 225 |
+
*.sln
|
| 226 |
+
*.sw?
|
cua2-core/env.example
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment Configuration
|
| 2 |
+
HOST=0.0.0.0
|
| 3 |
+
PORT=8000
|
| 4 |
+
DEBUG=true
|
| 5 |
+
|
| 6 |
+
# Agent Configuration
|
| 7 |
+
AGENT_TIMEOUT=300
|
| 8 |
+
MAX_CONCURRENT_TASKS=5
|
| 9 |
+
|
| 10 |
+
# Logging
|
| 11 |
+
LOG_LEVEL=INFO
|
cua2-core/pyproject.toml
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["hatchling"]
|
| 3 |
+
build-backend = "hatchling.build"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "cua2-core"
|
| 7 |
+
version = "0.0.0-dev.0"
|
| 8 |
+
description = "Backend API server for Computer Use Agent"
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
authors = [{ name = "Amir Mahla", email = "[email protected]" }]
|
| 11 |
+
keywords = ["fastapi", "api", "backend", "automation"]
|
| 12 |
+
classifiers = [
|
| 13 |
+
"Development Status :: 4 - Beta",
|
| 14 |
+
"Intended Audience :: Developers",
|
| 15 |
+
"License :: OSI Approved :: MIT License",
|
| 16 |
+
"Operating System :: OS Independent",
|
| 17 |
+
"Programming Language :: Python :: 3",
|
| 18 |
+
"Programming Language :: Python :: 3.10",
|
| 19 |
+
"Programming Language :: Python :: 3.11",
|
| 20 |
+
"Programming Language :: Python :: 3.12",
|
| 21 |
+
"Topic :: Internet :: WWW/HTTP :: HTTP Servers",
|
| 22 |
+
"Topic :: Software Development :: Libraries :: Application Frameworks",
|
| 23 |
+
]
|
| 24 |
+
requires-python = ">=3.10"
|
| 25 |
+
dependencies = [
|
| 26 |
+
"fastapi>=0.115.13",
|
| 27 |
+
"uvicorn[standard]>=0.29.0,<0.30.0",
|
| 28 |
+
"websockets>=13.1.0,<14.0.0",
|
| 29 |
+
"pydantic>=2.11.7",
|
| 30 |
+
"python-multipart>=0.0.18,<0.0.19",
|
| 31 |
+
"python-jose[cryptography]==3.3.0",
|
| 32 |
+
"passlib[bcrypt]==1.7.4",
|
| 33 |
+
"python-dotenv==1.0.0",
|
| 34 |
+
"httpx>=0.27.1",
|
| 35 |
+
"asyncio-mqtt==0.16.1",
|
| 36 |
+
"aiofiles==23.2.1",
|
| 37 |
+
"smolagents[openai,litellm]==1.15.0",
|
| 38 |
+
"openai==1.91.0",
|
| 39 |
+
"litellm[proxy]==1.63.14",
|
| 40 |
+
]
|
| 41 |
+
|
| 42 |
+
[project.optional-dependencies]
|
| 43 |
+
dev = [
|
| 44 |
+
"pytest>=7.0.0",
|
| 45 |
+
"pytest-asyncio>=0.21.0",
|
| 46 |
+
"pytest-cov>=4.0.0",
|
| 47 |
+
"black>=23.0.0",
|
| 48 |
+
"isort>=5.12.0",
|
| 49 |
+
"flake8>=6.0.0",
|
| 50 |
+
"mypy>=1.0.0",
|
| 51 |
+
"pre-commit>=3.0.0",
|
| 52 |
+
]
|
| 53 |
+
test = [
|
| 54 |
+
"pytest>=7.0.0",
|
| 55 |
+
"pytest-asyncio>=0.21.0",
|
| 56 |
+
"pytest-cov>=4.0.0",
|
| 57 |
+
]
|
| 58 |
+
|
| 59 |
+
[project.urls]
|
| 60 |
+
Homepage = "https://github.com/huggingface/CUA2"
|
| 61 |
+
Repository = "https://github.com/huggingface/CUA2"
|
| 62 |
+
|
| 63 |
+
[tool.hatch.build.targets.wheel]
|
| 64 |
+
packages = ["src/cua2-core"]
|
| 65 |
+
|
| 66 |
+
[tool.hatch.build.targets.sdist]
|
| 67 |
+
include = [
|
| 68 |
+
"/src",
|
| 69 |
+
"/README.md",
|
| 70 |
+
]
|
| 71 |
+
|
| 72 |
+
[tool.coverage.run]
|
| 73 |
+
source = ["src"]
|
| 74 |
+
omit = [
|
| 75 |
+
"*/tests/*",
|
| 76 |
+
"*/test_*",
|
| 77 |
+
"*/__pycache__/*",
|
| 78 |
+
"*/migrations/*",
|
| 79 |
+
]
|
| 80 |
+
|
| 81 |
+
[tool.coverage.report]
|
| 82 |
+
exclude_lines = [
|
| 83 |
+
"pragma: no cover",
|
| 84 |
+
"def __repr__",
|
| 85 |
+
"if self.debug:",
|
| 86 |
+
"if settings.DEBUG",
|
| 87 |
+
"raise AssertionError",
|
| 88 |
+
"raise NotImplementedError",
|
| 89 |
+
"if 0:",
|
| 90 |
+
"if __name__ == .__main__.:",
|
| 91 |
+
"class .*\\bProtocol\\):",
|
| 92 |
+
"@(abc\\.)?abstractmethod",
|
| 93 |
+
]
|
cua2-core/src/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Backend package
|
cua2-core/src/cua2-core/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Backend package
|
cua2-core/src/cua2-core/app.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from contextlib import asynccontextmanager
|
| 2 |
+
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
from fastapi import FastAPI
|
| 5 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 6 |
+
|
| 7 |
+
from backend.services.agent_service import AgentService
|
| 8 |
+
from backend.websocket.websocket_manager import WebSocketManager
|
| 9 |
+
|
| 10 |
+
# Load environment variables
|
| 11 |
+
load_dotenv()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage the application-wide services across the app's lifetime.

    Before the app starts serving, a ``WebSocketManager`` and an
    ``AgentService`` (which receives that manager) are created and published
    on ``app.state``. After the ``yield`` only shutdown messages are printed;
    no teardown is performed yet.
    """
    print("Initializing services...")

    # The agent service is handed the WebSocket manager at construction time.
    ws_manager = WebSocketManager()
    service = AgentService(ws_manager)

    # Publish both on app.state so request handlers can look them up.
    app.state.websocket_manager = ws_manager
    app.state.agent_service = service

    print("Services initialized successfully")

    yield

    # Nothing to release at the moment; add cleanup here when it is needed.
    print("Shutting down services...")
    print("Services shut down successfully")
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# Application instance; startup and shutdown are delegated to `lifespan`.
app = FastAPI(
    lifespan=lifespan,
    title="Computer Use Studio Backend",
    description="Backend API for Computer Use Studio - AI-powered automation interface",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
)

# CORS: only local development frontends are allowed, on both the
# `localhost` and `127.0.0.1` spellings of each port.
app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        # localhost
        "http://localhost:3000",  # React dev server
        "http://localhost:5173",  # Vite dev server
        "http://localhost:8080",  # Alternative frontend port
        # loopback IP, same ports
        "http://127.0.0.1:3000",
        "http://127.0.0.1:5173",
        "http://127.0.0.1:8080",
    ],
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=True,
)
|
cua2-core/src/cua2-core/main.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
import uvicorn
|
| 4 |
+
|
| 5 |
+
from backend.app import app
|
| 6 |
+
from backend.routes.routes import router
|
| 7 |
+
from backend.routes.websocket import router as websocket_router
|
| 8 |
+
|
| 9 |
+
# Include routes
|
| 10 |
+
app.include_router(router, prefix="/api/v1")
|
| 11 |
+
app.include_router(websocket_router)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Health check endpoint (without prefix)
|
| 15 |
+
@app.get("/health")
async def health():
    """Return a static liveness payload, served at ``/health`` without the API prefix."""
    payload = {"status": "healthy", "service": "computer-use-studio-backend"}
    return payload
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
if __name__ == "__main__":
    # Runtime settings are read from the environment (HOST, PORT, DEBUG),
    # with defaults used when a variable is unset.
    debug = os.getenv("DEBUG", "false").lower() == "true"
    host = os.getenv("HOST", "0.0.0.0")
    port = int(os.getenv("PORT", 8000))

    # Debug mode turns on both auto-reload and verbose logging.
    log_level = "debug" if debug else "info"

    print(f"Starting Computer Use Studio Backend on {host}:{port}")
    print(f"Debug mode: {debug}")
    print(f"API Documentation: http://{host}:{port}/docs")
    print(f"WebSocket endpoint: ws://{host}:{port}/ws")

    # The app is passed as an import string rather than as an object.
    uvicorn.run(
        "backend.app:app",
        host=host,
        port=port,
        reload=debug,
        log_level=log_level,
    )
|
cua2-core/src/cua2-core/models/models.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from enum import Enum
|
| 5 |
+
from typing import Literal, Optional
|
| 6 |
+
|
| 7 |
+
from pydantic import BaseModel, model_validator
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class AgentMetadata(BaseModel):
    """Usage statistics attached to an agent run."""

    # Token counters for the run.
    inputTokensUsed: int
    outputTokensUsed: int
    # Duration of the run, in seconds.
    timeTaken: float
    # Step count for the run.
    numberOfSteps: int
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class AgentType(str, Enum):
    """Closed set of agent variants, named after their coordinate scheme.

    Members are also ``str`` instances, so they compare equal to their raw
    string values.
    """

    PIXEL_COORDINATES = "pixel_coordinates"
    NORMALIZED_1000_COORDINATES = "normalized_1000_coordinates"
    NORMALIZED_COORDINATES = "normalized_coordinates"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class ActiveTask(BaseModel):
    """A user task tracked by the backend, persisted to disk on creation."""

    message_id: str
    content: str
    model_id: str
    start_time: datetime
    status: str

    @property
    def trace_path(self):
        """Relative directory for this task's trace, built from message and model IDs."""
        return f"data/trace-{self.message_id}-{self.model_id}"

    @model_validator(mode="after")
    def validate_model_id(self):
        """Persist the task right after the model is validated.

        Despite its name, this hook performs no checks: it creates the trace
        directory and writes the JSON dump of the model to ``user_tasks.json``
        inside it, then returns the instance unchanged.
        """
        trace_dir = self.trace_path
        os.makedirs(trace_dir, exist_ok=True)

        with open(f"{trace_dir}/user_tasks.json", "w") as f:
            json.dump(self.model_dump(mode="json"), f, indent=2)

        return self
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class WebSocketEvent(BaseModel):
    """Envelope for events exchanged over the WebSocket.

    Only ``type`` is required; every other field defaults to ``None``.
    """

    # Discriminator: one of the fixed event names below.
    type: Literal[
        "agent_start",
        "agent_progress",
        "agent_complete",
        "agent_error",
        "vnc_url_set",
        "vnc_url_unset",
        "heartbeat",
    ]
    content: Optional[str] = None
    metadata: Optional[AgentMetadata] = None
    messageId: Optional[str] = None
    vncUrl: Optional[str] = None
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class UserTaskMessage(BaseModel):
    """Task request as sent by the frontend to the backend."""

    # Must be exactly "user_task"; any other value fails validation.
    type: Literal["user_task"]
    content: str
    model_id: str
    # Kept as a plain string; it is not parsed into a datetime here.
    timestamp: str
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class AgentMessage(BaseModel):
    """A single message, authored either by the user or by the agent."""

    id: str
    # Author of the message.
    type: Literal["user", "agent"]
    content: str
    timestamp: datetime
    # Optional extras; all default to None.
    metadata: Optional[AgentMetadata] = None
    isLoading: Optional[bool] = None
    truncated: Optional[bool] = None
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class HealthResponse(BaseModel):
    """Response body of the health-check endpoint."""

    status: str
    timestamp: datetime
    # Count of WebSocket connections.
    websocket_connections: int
|
cua2-core/src/cua2-core/routes/routes.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime
|
| 2 |
+
|
| 3 |
+
from fastapi import APIRouter, Depends, HTTPException, Request
|
| 4 |
+
|
| 5 |
+
# Get services from app state
|
| 6 |
+
from backend.models.models import HealthResponse
|
| 7 |
+
from backend.services.agent_service import AgentService
|
| 8 |
+
from backend.websocket.websocket_manager import WebSocketManager
|
| 9 |
+
|
| 10 |
+
# Create router
|
| 11 |
+
router = APIRouter()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def get_websocket_manager(request: Request) -> WebSocketManager:
    """Return the ``websocket_manager`` stored on the app state (used with ``Depends``)."""
    state = request.app.state
    return state.websocket_manager
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def get_agent_service(request: Request) -> AgentService:
    """Return the ``agent_service`` stored on the app state (used with ``Depends``)."""
    state = request.app.state
    return state.agent_service
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@router.get("/health", response_model=HealthResponse)
async def health_check(
    websocket_manager: WebSocketManager = Depends(get_websocket_manager),
):
    """Report service health with the current time and WebSocket connection count."""
    connection_count = websocket_manager.get_connection_count()
    now = datetime.now()
    return HealthResponse(
        status="healthy",
        timestamp=now,
        websocket_connections=connection_count,
    )
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@router.get("/tasks")
async def get_active_tasks(
    agent_service: AgentService = Depends(get_agent_service),
    websocket_manager: WebSocketManager = Depends(get_websocket_manager),
):
    """Return the agent service's active tasks and the WebSocket connection count."""
    tasks = agent_service.get_active_tasks()
    connections = websocket_manager.get_connection_count()
    return {"active_tasks": tasks, "total_connections": connections}
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@router.get("/tasks/{task_id}")
async def get_task_status(
    task_id: str, agent_service: AgentService = Depends(get_agent_service)
):
    """Return the status of one task.

    Raises:
        HTTPException: 404 when the agent service returns ``None`` for
            ``task_id``.
    """
    status = agent_service.get_task_status(task_id)
    if status is not None:
        return {"task_id": task_id, "status": status}
    raise HTTPException(status_code=404, detail="Task not found")
|
cua2-core/src/cua2-core/routes/websocket.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
from fastapi import APIRouter, WebSocket, WebSocketDisconnect
|
| 4 |
+
|
| 5 |
+
# Get services from app state
|
| 6 |
+
from backend.app import app
|
| 7 |
+
from backend.models.models import UserTaskMessage, WebSocketEvent
|
| 8 |
+
|
| 9 |
+
# Create router
|
| 10 |
+
router = APIRouter()
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """Serve the ``/ws`` WebSocket: greet the client, then process task messages.

    Each text frame is parsed as JSON into a ``UserTaskMessage``. A valid task
    is handed to the agent service and acknowledged with an ``agent_start``
    event carrying the returned message ID. Invalid JSON or any other
    processing failure is answered with an ``agent_error`` event and the loop
    keeps running. The connection is always unregistered from the WebSocket
    manager on exit.

    A client disconnect (``WebSocketDisconnect``) is re-raised from inside the
    loop so that it reaches the dedicated outer handler instead of being
    reported as a receive/processing error.
    """
    # Services are looked up on the shared application state.
    websocket_manager = app.state.websocket_manager
    agent_service = app.state.agent_service

    await websocket_manager.connect(websocket)

    try:
        welcome_message = WebSocketEvent(
            type="heartbeat",
            content="WebSocket connection established successfully",
            messageId="connection_welcome",
        )
        await websocket_manager.send_personal_message(welcome_message, websocket)

        # Keep the connection alive and wait for messages
        while True:
            try:
                # Wait for messages from client
                data = await websocket.receive_text()

                try:
                    # Parse the message
                    message_data = json.loads(data)
                    message = UserTaskMessage(**message_data)

                    # Process the user task
                    if message.type == "user_task":
                        message_id = await agent_service.process_user_task(
                            message.content, message.model_id
                        )

                        # Send acknowledgment back to the client
                        response = WebSocketEvent(
                            type="agent_start",
                            content=f"Received task: {message.content}",
                            messageId=message_id,
                        )
                        await websocket_manager.send_personal_message(
                            response, websocket
                        )

                except WebSocketDisconnect:
                    # The peer is gone: do not try to send an error reply.
                    raise

                except json.JSONDecodeError:
                    error_response = WebSocketEvent(
                        type="agent_error", content="Invalid JSON format"
                    )
                    await websocket_manager.send_personal_message(
                        error_response, websocket
                    )

                except Exception as e:
                    print(f"Error processing message: {e}")
                    error_response = WebSocketEvent(
                        type="agent_error",
                        content=f"Error processing message: {str(e)}",
                    )
                    await websocket_manager.send_personal_message(
                        error_response, websocket
                    )

            except WebSocketDisconnect:
                # Normal client disconnect: let the outer handler report it.
                raise

            except Exception as e:
                print(f"Error receiving WebSocket message: {e}")
                # If we can't receive messages, the connection is likely broken
                break

    except WebSocketDisconnect:
        print("WebSocket disconnected normally")
    except Exception as e:
        print(f"WebSocket connection error: {e}")
    finally:
        # Ensure cleanup happens
        websocket_manager.disconnect(websocket)
|
cua2-core/src/cua2-core/services/agent_service.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import uuid
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
from smolagents import Model
|
| 7 |
+
|
| 8 |
+
from backend.models.models import ActiveTask, AgentMetadata
|
| 9 |
+
from backend.services.agents.get_agents import get_agent
|
| 10 |
+
from backend.services.models.get_model import get_model
|
| 11 |
+
from backend.websocket.websocket_manager import WebSocketManager
|
| 12 |
+
from computer_use_studio import Sandbox
|
| 13 |
+
from computer_use_studio.logger import get_logger
|
| 14 |
+
|
| 15 |
+
logger = get_logger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class AgentService:
    """Service for handling agent tasks and processing.

    Submitted tasks are tracked in ``active_tasks`` (keyed by message ID) and
    progress is reported through the injected WebSocket manager.
    """

    # Pause between two simulated steps, in seconds.
    SIMULATED_STEP_DELAY_SECONDS = 2

    # Progress messages emitted, in order, by the simulated run.
    SIMULATED_STEPS = (
        "Analyzing task requirements...",
        "Planning execution steps...",
        "Initializing computer interface...",
        "Executing task commands...",
        "Verifying results...",
        "Finalizing task completion...",
    )

    # VNC endpoint announced once the simulated computer interface is "ready".
    SIMULATED_VNC_URL = (
        "http://localhost:6080/vnc.html?host=localhost&port=5900&autoconnect=true"
    )

    def __init__(self, websocket_manager):
        """Create the service.

        Args:
            websocket_manager: Manager used to push agent events to clients.
        """
        self.active_tasks: dict[str, ActiveTask] = {}
        self.websocket_manager: WebSocketManager = websocket_manager
        # The event loop only keeps weak references to tasks, so hold strong
        # references here until each background task finishes.
        self._background_tasks: set[asyncio.Task] = set()

    async def process_user_task(self, content: str, model_id: str) -> str:
        """Register a user task, start processing it in the background and
        return its message ID.

        Args:
            content: The task text submitted by the user.
            model_id: Identifier of the model requested for the task.

        Returns:
            The freshly generated message ID under which the task is tracked.
        """
        message_id = str(uuid.uuid4())
        while message_id in self.active_tasks:
            message_id = str(uuid.uuid4())

        # Store the task
        self.active_tasks[message_id] = ActiveTask(
            message_id=message_id,
            content=content,
            model_id=model_id,
            start_time=datetime.now(),
            status="processing",
        )

        # Determine the agent type based on the content of the task
        # (TODO: implement agent type detection using LLM)
        prompt_type = "FORM_SYSTEM_PROMPT"

        # Start the agent processing in the background
        background_task = asyncio.create_task(
            self._simulate_agent_processing(content, model_id, message_id, prompt_type)
        )
        self._background_tasks.add(background_task)
        background_task.add_done_callback(self._background_tasks.discard)

        return message_id

    async def _simulate_agent_processing(
        self, content: str, model_id: str, message_id: str, prompt_type: str
    ) -> None:
        """Simulate agent processing with progress updates.

        The parameter order matches the call in ``process_user_task``.
        ``model_id`` and ``prompt_type`` are accepted for that call but are not
        used by the simulation.

        NOTE(review): the ``send_*`` methods and the ``AgentMetadata`` field
        names are taken from the previously commented-out implementation --
        confirm they match the current WebSocketManager / models API.

        Args:
            content: The task text, echoed in the emitted events.
            model_id: Requested model identifier (unused here).
            message_id: ID of the task in ``active_tasks``.
            prompt_type: Name of the system prompt to use (unused here).
        """
        steps = self.SIMULATED_STEPS
        try:
            # Send agent start event
            await self.websocket_manager.send_agent_start(
                content=f"Starting task: {content}", message_id=message_id
            )

            for index, step in enumerate(steps):
                await asyncio.sleep(self.SIMULATED_STEP_DELAY_SECONDS)

                # Send progress update
                await self.websocket_manager.send_agent_progress(
                    content=f"{step} ({index + 1}/{len(steps)})",
                    message_id=message_id,
                )

                if index == 2:  # After "Initializing computer interface..."
                    await self.websocket_manager.send_vnc_url_set(
                        vnc_url=self.SIMULATED_VNC_URL,
                        content="Computer interface ready, VNC stream connected",
                    )
                elif index == 4:  # After "Verifying results..."
                    await self.websocket_manager.send_vnc_url_unset(
                        content="Task verification complete, disconnecting VNC stream"
                    )

            # ActiveTask is built with keyword fields, so read the start time
            # as an attribute -- TODO confirm against the model definition.
            start_time = self.active_tasks[message_id].start_time
            time_taken = (datetime.now() - start_time).total_seconds()

            metadata = AgentMetadata(
                tokensUsed=150 + len(content) * 2,  # Simulate token usage
                timeTaken=time_taken,
                numberOfSteps=len(steps),
            )

            # Send completion event
            await self.websocket_manager.send_agent_complete(
                content=f"Task completed successfully: {content}",
                message_id=message_id,
                metadata=metadata,
            )
        except Exception as e:
            # Top-level boundary of the background task: report the failure to
            # the client instead of letting it vanish inside the task.
            await self.websocket_manager.send_agent_error(
                content=f"Error processing task: {str(e)}", message_id=message_id
            )
        finally:
            # Clean up whether the run succeeded or failed.
            self.active_tasks.pop(message_id, None)

    def get_active_tasks(self) -> "dict[str, ActiveTask]":
        """Return a shallow copy of the currently active tasks."""
        return self.active_tasks.copy()

    def get_task_status(self, message_id: str) -> "ActiveTask | None":
        """Return the tracked task for ``message_id``, or ``None`` if unknown."""
        return self.active_tasks.get(message_id)
|
cua2-core/src/cua2-core/services/agents/get_agents.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Annotated, TypeAlias
|
| 2 |
+
|
| 3 |
+
from pydantic import Field
|
| 4 |
+
from smolagents import Model
|
| 5 |
+
|
| 6 |
+
from backend.models.models import AgentType
|
| 7 |
+
from backend.services.agents.normalized_1000_agent import Normalized1000Agent
|
| 8 |
+
from backend.services.agents.normalized_agent import NormalizedAgent
|
| 9 |
+
from backend.services.agents.pixel_coordonates_agent import PixelCoordinatesAgent
|
| 10 |
+
from backend.services.agents.prompt import (
|
| 11 |
+
Normalized1000CoordinatesSystemPrompt,
|
| 12 |
+
NormalizedCoordinatesSystemPrompt,
|
| 13 |
+
PixelCoordinatesSystemPrompt,
|
| 14 |
+
)
|
| 15 |
+
from computer_use_studio import Sandbox
|
| 16 |
+
|
| 17 |
+
# Union of the concrete agent classes that get_agent() can return.
# NOTE(review): Field(discriminator=...) only has an effect when pydantic
# validates this type; confirm the agent classes are meant to be used that way.
Agent: TypeAlias = Annotated[
|
| 18 |
+
PixelCoordinatesAgent | Normalized1000Agent | NormalizedAgent,
|
| 19 |
+
Field(discriminator="AGENT_TYPE"),
|
| 20 |
+
]
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def get_agent(
    model: Model,
    desktop: Sandbox,
    agent_type: AgentType,
    prompt_type: str,
    data_dir: str,
    **kwargs,
) -> Agent:
    """Build the agent that matches ``agent_type``.

    Args:
        model: Model passed to the agent.
        desktop: Sandbox the agent drives.
        agent_type: Which coordinate convention (and agent class) to use.
        prompt_type: Member name looked up in the prompt collection that
            belongs to the chosen agent type; its ``.value`` becomes the
            system prompt.
        data_dir: Data directory passed to the agent.
        **kwargs: Forwarded unchanged to the agent constructor.

    Returns:
        A new agent instance of the class matching ``agent_type``.

    Raises:
        ValueError: If ``agent_type`` is not one of the supported types.
    """
    if agent_type == AgentType.PIXEL_COORDINATES:
        agent_cls, prompts = PixelCoordinatesAgent, PixelCoordinatesSystemPrompt
    elif agent_type == AgentType.NORMALIZED_1000_COORDINATES:
        agent_cls, prompts = Normalized1000Agent, Normalized1000CoordinatesSystemPrompt
    elif agent_type == AgentType.NORMALIZED_COORDINATES:
        # The 0.0-1.0 convention needs NormalizedAgent; using the 0-1000 agent
        # here would misread every coordinate the model produces.
        agent_cls, prompts = NormalizedAgent, NormalizedCoordinatesSystemPrompt
    else:
        raise ValueError(f"Invalid agent type: {agent_type}")

    return agent_cls(
        model=model,
        desktop=desktop,
        system_prompt=prompts[prompt_type].value,
        data_dir=data_dir,
        **kwargs,
    )
|
cua2-core/src/cua2-core/services/agents/normalized_1000_agent.py
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import unicodedata
|
| 3 |
+
from typing import List, Literal
|
| 4 |
+
|
| 5 |
+
# smolagents imports
|
| 6 |
+
from smolagents import Model, Tool, tool
|
| 7 |
+
from smolagents.monitoring import LogLevel
|
| 8 |
+
|
| 9 |
+
from backend.models.models import AgentType
|
| 10 |
+
from backend.services.agents.prompt import Normalized1000CoordinatesSystemPrompt
|
| 11 |
+
from computer_use_studio import DesktopAgentBase, Sandbox
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class Normalized1000Agent(DesktopAgentBase):
    """Agent for desktop automation with normalized coordinates (0 to 1000).

    Coordinate-taking tools receive values on a 0-1000 scale per axis and
    convert them to pixels with ``_normalize_to_pixel`` before calling the
    sandbox.
    """

    AGENT_TYPE = AgentType.NORMALIZED_1000_COORDINATES

    def __init__(
        self,
        model: Model,
        data_dir: str,
        desktop: Sandbox,
        system_prompt: Normalized1000CoordinatesSystemPrompt,
        tools: List[Tool] | None = None,
        max_steps: int = 20,
        verbosity_level: LogLevel = LogLevel.INFO,
        planning_interval: int | None = None,
        use_v1_prompt: bool = False,
        **kwargs,
    ):
        """Forward every argument unchanged to ``DesktopAgentBase``."""
        super().__init__(
            model=model,
            data_dir=data_dir,
            desktop=desktop,
            system_prompt=system_prompt,
            tools=tools,
            max_steps=max_steps,
            verbosity_level=verbosity_level,
            planning_interval=planning_interval,
            use_v1_prompt=use_v1_prompt,
            **kwargs,
        )

    def _normalize_to_pixel(self, norm_x: int, norm_y: int) -> tuple[int, int]:
        """
        Convert normalized coordinates (0-1000) to pixel coordinates

        Out-of-range inputs are clamped, and the result is kept inside
        ``[0, width - 1] x [0, height - 1]`` (reads ``self.width`` and
        ``self.height``).

        Args:
            norm_x: Normalized x coordinate (0 to 1000)
            norm_y: Normalized y coordinate (0 to 1000)
        Returns:
            Tuple of (pixel_x, pixel_y)
        """
        # Clamp values to valid range
        norm_x = max(0, min(1000, norm_x))
        norm_y = max(0, min(1000, norm_y))

        # Convert from 0-1000 range to 0-1 range, then to pixels
        pixel_x = int(norm_x / 1000.0 * self.width)
        pixel_y = int(norm_y / 1000.0 * self.height)

        # 1000 maps to width/height, one past the last pixel: pull it back in.
        pixel_x = max(0, min(self.width - 1, pixel_x))
        pixel_y = max(0, min(self.height - 1, pixel_y))

        return pixel_x, pixel_y

    def _setup_desktop_tools(self):
        """Register all desktop tools with normalized coordinate support (0-1000)"""

        # NOTE: the docstrings of the @tool functions below are handed to the
        # tool decorator, so they are kept exactly as they were.

        @tool
        def click(x: int, y: int) -> str:
            """
            Performs a left-click at the specified normalized coordinates
            Args:
                x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
                y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
            """
            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
            self.desktop.left_click(pixel_x, pixel_y)
            self.click_coordinates = (pixel_x, pixel_y)
            message = f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
            self.logger.log(message)
            time.sleep(1)
            return message

        @tool
        def right_click(x: int, y: int) -> str:
            """
            Performs a right-click at the specified normalized coordinates
            Args:
                x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
                y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
            """
            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
            self.desktop.right_click(pixel_x, pixel_y)
            self.click_coordinates = (pixel_x, pixel_y)
            message = f"Right-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
            self.logger.log(message)
            return message

        @tool
        def double_click(x: int, y: int) -> str:
            """
            Performs a double-click at the specified normalized coordinates
            Args:
                x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
                y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
            """
            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
            self.desktop.double_click(pixel_x, pixel_y)
            self.click_coordinates = (pixel_x, pixel_y)
            message = f"Double-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
            self.logger.log(message)
            return message

        @tool
        def move_mouse(x: int, y: int) -> str:
            """
            Moves the mouse cursor to the specified normalized coordinates
            Args:
                x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
                y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
            """
            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
            self.desktop.move_mouse(pixel_x, pixel_y)
            message = f"Moved mouse to normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
            self.logger.log(message)
            return message

        def normalize_text(text):
            # Strips combining marks (accents). Currently unused: `write`
            # sends the text as-is (see the commented-out call there).
            return "".join(
                c
                for c in unicodedata.normalize("NFD", text)
                if not unicodedata.combining(c)
            )

        @tool
        def write(text: str) -> str:
            """
            Types the specified text at the current cursor position.
            Args:
                text: The text to type
            """
            # clean_text = normalize_text(text)
            self.desktop.write(text, delay_in_ms=10)
            message = f"Typed text: '{text}'"
            self.logger.log(message)
            time.sleep(1)
            return message

        @tool
        def press(key: str) -> str:
            """
            Presses a keyboard key or combination of keys
            Args:
                key: The key to press (e.g. "enter", "space", "backspace", etc.) or a multiple keys string to press, for example "ctrl+a" or "ctrl+shift+a".
            """
            self.desktop.press(key)
            message = f"Pressed key: {key}"
            self.logger.log(message)
            time.sleep(0.1)
            return message

        @tool
        def drag(x1: int, y1: int, x2: int, y2: int) -> str:
            """
            Clicks at normalized coordinates [x1, y1], drags mouse to [x2, y2], then release click.
            Args:
                x1: origin normalized x coordinate (0 to 1000)
                y1: origin normalized y coordinate (0 to 1000)
                x2: end normalized x coordinate (0 to 1000)
                y2: end normalized y coordinate (0 to 1000)
            """
            pixel_x1, pixel_y1 = self._normalize_to_pixel(x1, y1)
            pixel_x2, pixel_y2 = self._normalize_to_pixel(x2, y2)
            self.desktop.drag((pixel_x1, pixel_y1), (pixel_x2, pixel_y2))
            message = f"Dragged and dropped from normalized [{x1}, {y1}] to [{x2}, {y2}] -> pixels [{pixel_x1}, {pixel_y1}] to [{pixel_x2}, {pixel_y2}]"
            self.logger.log(message)
            return message

        @tool
        def scroll(
            x: int,
            y: int,
            direction: Literal["up", "down"] = "down",
            amount: int = 2,
        ) -> str:
            """
            Moves the mouse to selected normalized coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
            Args:
                x: The normalized x coordinate (0 to 1000) of the element to scroll/zoom
                y: The normalized y coordinate (0 to 1000) of the element to scroll/zoom
                direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
                amount: The amount to scroll. A good amount is 1 or 2.
            """
            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
            self.desktop.move_mouse(pixel_x, pixel_y)
            self.desktop.scroll(direction=direction, amount=amount)
            message = f"Scrolled {direction} by {amount} at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
            self.logger.log(message)
            return message

        @tool
        def wait(seconds: float) -> str:
            """
            Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
            Args:
                seconds: Number of seconds to wait, generally 3 is enough.
            """
            time.sleep(seconds)
            message = f"Waited for {seconds} seconds"
            self.logger.log(message)
            return message

        # The function name doubles as the registered tool name, so this keeps
        # the name `open` even though it shadows the builtin inside this method.
        @tool
        def open(file_or_url: str) -> str:
            """
            Directly opens a browser with the specified url or opens a file with the default application: use this at start of web searches rather than trying to click the browser or open a file by clicking.
            Args:
                file_or_url: The URL or file to open
            """

            self.desktop.open(file_or_url)
            # Give it time to load
            time.sleep(2)
            self.logger.log(f"Opening: {file_or_url}")
            return f"Opened: {file_or_url}"

        @tool
        def launch_app(app_name: str) -> str:
            """
            Launches the specified application.
            Args:
                app_name: the name of the application to launch
            """
            self.desktop.launch(app_name)
            message = f"Launched app: {app_name}"
            self.logger.log(message)
            return message

        @tool
        def execute(command: str) -> str:
            """
            Executes a terminal command in the desktop environment.
            Args:
                command: The command to execute
            """
            self.desktop.execute_command(command)
            message = f"Executed command: {command}"
            self.logger.log(message)
            return message

        @tool
        def refresh() -> str:
            """
            Refreshes the current web page if you're in a browser.
            """
            self.desktop.press(["ctrl", "r"])
            message = "Refreshed the current page"
            self.logger.log(message)
            return message

        @tool
        def go_back() -> str:
            """
            Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
            Args:
            """
            self.desktop.press(["alt", "left"])
            message = "Went back one page"
            self.logger.log(message)
            return message

        # Register the tools: each one exactly once, in the original order.
        registered_tools = (
            ("click", click),
            ("right_click", right_click),
            ("double_click", double_click),
            ("move_mouse", move_mouse),
            ("write", write),
            ("press", press),
            ("scroll", scroll),
            ("wait", wait),
            ("open", open),
            ("go_back", go_back),
            ("drag", drag),
            ("launch_app", launch_app),
            ("execute", execute),
            ("refresh", refresh),
        )
        for tool_name, desktop_tool in registered_tools:
            self.tools[tool_name] = desktop_tool
|
cua2-core/src/cua2-core/services/agents/normalized_agent.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import unicodedata
|
| 3 |
+
from typing import List, Literal
|
| 4 |
+
|
| 5 |
+
# smolagents imports
|
| 6 |
+
from smolagents import Model, Tool, tool
|
| 7 |
+
from smolagents.monitoring import LogLevel
|
| 8 |
+
|
| 9 |
+
from backend.models.models import AgentType
|
| 10 |
+
from backend.services.agents.prompt import NormalizedCoordinatesSystemPrompt
|
| 11 |
+
from computer_use_studio import DesktopAgentBase, Sandbox
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class NormalizedAgent(DesktopAgentBase):
|
| 15 |
+
"""Agent for desktop automation with normalized coordinates (0.0 to 1.0)"""
|
| 16 |
+
|
| 17 |
+
AGENT_TYPE = AgentType.NORMALIZED_COORDINATES
|
| 18 |
+
|
| 19 |
+
def __init__(
|
| 20 |
+
self,
|
| 21 |
+
model: Model,
|
| 22 |
+
data_dir: str,
|
| 23 |
+
desktop: Sandbox,
|
| 24 |
+
system_prompt: NormalizedCoordinatesSystemPrompt,
|
| 25 |
+
tools: List[Tool] | None = None,
|
| 26 |
+
max_steps: int = 20,
|
| 27 |
+
verbosity_level: LogLevel = LogLevel.INFO,
|
| 28 |
+
planning_interval: int | None = None,
|
| 29 |
+
use_v1_prompt: bool = False,
|
| 30 |
+
**kwargs,
|
| 31 |
+
):
|
| 32 |
+
super().__init__(
|
| 33 |
+
model=model,
|
| 34 |
+
data_dir=data_dir,
|
| 35 |
+
desktop=desktop,
|
| 36 |
+
system_prompt=system_prompt,
|
| 37 |
+
tools=tools,
|
| 38 |
+
max_steps=max_steps,
|
| 39 |
+
verbosity_level=verbosity_level,
|
| 40 |
+
planning_interval=planning_interval,
|
| 41 |
+
use_v1_prompt=use_v1_prompt,
|
| 42 |
+
**kwargs,
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
def _normalize_to_pixel(self, norm_x: float, norm_y: float) -> tuple[int, int]:
|
| 46 |
+
"""
|
| 47 |
+
Convert normalized coordinates (0.0-1.0) to pixel coordinates
|
| 48 |
+
Args:
|
| 49 |
+
norm_x: Normalized x coordinate (0.0 to 1.0)
|
| 50 |
+
norm_y: Normalized y coordinate (0.0 to 1.0)
|
| 51 |
+
Returns:
|
| 52 |
+
Tuple of (pixel_x, pixel_y)
|
| 53 |
+
"""
|
| 54 |
+
# Clamp values to valid range
|
| 55 |
+
norm_x = max(0.0, min(1.0, norm_x))
|
| 56 |
+
norm_y = max(0.0, min(1.0, norm_y))
|
| 57 |
+
|
| 58 |
+
pixel_x = int(norm_x * self.width)
|
| 59 |
+
pixel_y = int(norm_y * self.height)
|
| 60 |
+
|
| 61 |
+
# Ensure we don't go outside screen bounds
|
| 62 |
+
pixel_x = max(0, min(self.width - 1, pixel_x))
|
| 63 |
+
pixel_y = max(0, min(self.height - 1, pixel_y))
|
| 64 |
+
|
| 65 |
+
return pixel_x, pixel_y
|
| 66 |
+
|
| 67 |
+
def _setup_desktop_tools(self):
|
| 68 |
+
"""Register all desktop tools with normalized coordinate support"""
|
| 69 |
+
|
| 70 |
+
@tool
|
| 71 |
+
def click(x: float, y: float) -> str:
|
| 72 |
+
"""
|
| 73 |
+
Performs a left-click at the specified normalized coordinates
|
| 74 |
+
Args:
|
| 75 |
+
x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
|
| 76 |
+
y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
|
| 77 |
+
"""
|
| 78 |
+
pixel_x, pixel_y = self._normalize_to_pixel(x, y)
|
| 79 |
+
self.desktop.left_click(pixel_x, pixel_y)
|
| 80 |
+
self.click_coordinates = (pixel_x, pixel_y)
|
| 81 |
+
self.logger.log(
|
| 82 |
+
f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
|
| 83 |
+
)
|
| 84 |
+
return f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
|
| 85 |
+
|
| 86 |
+
@tool
|
| 87 |
+
def right_click(x: float, y: float) -> str:
|
| 88 |
+
"""
|
| 89 |
+
Performs a right-click at the specified normalized coordinates
|
| 90 |
+
Args:
|
| 91 |
+
x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
|
| 92 |
+
y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
|
| 93 |
+
"""
|
| 94 |
+
pixel_x, pixel_y = self._normalize_to_pixel(x, y)
|
| 95 |
+
self.desktop.right_click(pixel_x, pixel_y)
|
| 96 |
+
self.click_coordinates = (pixel_x, pixel_y)
|
| 97 |
+
self.logger.log(
|
| 98 |
+
f"Right-clicked at normalized coordinates ({pixel_x}, {pixel_y})"
|
| 99 |
+
)
|
| 100 |
+
return f"Right-clicked at normalized coordinates ({pixel_x}, {pixel_y})"
|
| 101 |
+
|
| 102 |
+
@tool
|
| 103 |
+
def double_click(x: float, y: float) -> str:
|
| 104 |
+
"""
|
| 105 |
+
Performs a double-click at the specified normalized coordinates
|
| 106 |
+
Args:
|
| 107 |
+
x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
|
| 108 |
+
y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
|
| 109 |
+
"""
|
| 110 |
+
pixel_x, pixel_y = self._normalize_to_pixel(x, y)
|
| 111 |
+
self.desktop.double_click(pixel_x, pixel_y)
|
| 112 |
+
self.click_coordinates = (pixel_x, pixel_y)
|
| 113 |
+
self.logger.log(
|
| 114 |
+
f"Double-clicked at normalized coordinates ({pixel_x}, {pixel_y})"
|
| 115 |
+
)
|
| 116 |
+
return f"Double-clicked at normalized coordinates ({pixel_x}, {pixel_y})"
|
| 117 |
+
|
| 118 |
+
@tool
|
| 119 |
+
def move_mouse(x: float, y: float) -> str:
|
| 120 |
+
"""
|
| 121 |
+
Moves the mouse cursor to the specified normalized coordinates
|
| 122 |
+
Args:
|
| 123 |
+
x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
|
| 124 |
+
y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
|
| 125 |
+
"""
|
| 126 |
+
pixel_x, pixel_y = self._normalize_to_pixel(x, y)
|
| 127 |
+
self.desktop.move_mouse(pixel_x, pixel_y)
|
| 128 |
+
self.logger.log(
|
| 129 |
+
f"Moved mouse to normalized coordinates ({pixel_x}, {pixel_y})"
|
| 130 |
+
)
|
| 131 |
+
return f"Moved mouse to normalized coordinates ({pixel_x}, {pixel_y})"
|
| 132 |
+
|
| 133 |
+
def normalize_text(text):
|
| 134 |
+
return "".join(
|
| 135 |
+
c
|
| 136 |
+
for c in unicodedata.normalize("NFD", text)
|
| 137 |
+
if not unicodedata.combining(c)
|
| 138 |
+
)
|
| 139 |
+
|
| 140 |
+
@tool
|
| 141 |
+
def write(text: str) -> str:
|
| 142 |
+
"""
|
| 143 |
+
Types the specified text at the current cursor position.
|
| 144 |
+
Args:
|
| 145 |
+
text: The text to type
|
| 146 |
+
"""
|
| 147 |
+
# clean_text = normalize_text(text)
|
| 148 |
+
self.desktop.write(text, delay_in_ms=10)
|
| 149 |
+
self.logger.log(f"Typed text: '{text}'")
|
| 150 |
+
return f"Typed text: '{text}'"
|
| 151 |
+
|
| 152 |
+
@tool
|
| 153 |
+
def press(key: str) -> str:
|
| 154 |
+
"""
|
| 155 |
+
Presses a keyboard key or combination of keys
|
| 156 |
+
Args:
|
| 157 |
+
key: The key to press (e.g. "enter", "space", "backspace", etc.) or a multiple keys string to press, for example "ctrl+a" or "ctrl+shift+a".
|
| 158 |
+
"""
|
| 159 |
+
self.desktop.press(key)
|
| 160 |
+
self.logger.log(f"Pressed key: {key}")
|
| 161 |
+
return f"Pressed key: {key}"
|
| 162 |
+
|
| 163 |
+
@tool
|
| 164 |
+
def drag(x1: float, y1: float, x2: float, y2: float) -> str:
|
| 165 |
+
"""
|
| 166 |
+
Clicks at normalized coordinates [x1, y1], drags mouse to [x2, y2], then release click.
|
| 167 |
+
Args:
|
| 168 |
+
x1: origin normalized x coordinate (0.0 to 1.0)
|
| 169 |
+
y1: origin normalized y coordinate (0.0 to 1.0)
|
| 170 |
+
x2: end normalized x coordinate (0.0 to 1.0)
|
| 171 |
+
y2: end normalized y coordinate (0.0 to 1.0)
|
| 172 |
+
"""
|
| 173 |
+
pixel_x1, pixel_y1 = self._normalize_to_pixel(x1, y1)
|
| 174 |
+
pixel_x2, pixel_y2 = self._normalize_to_pixel(x2, y2)
|
| 175 |
+
self.desktop.drag((pixel_x1, pixel_y1), (pixel_x2, pixel_y2))
|
| 176 |
+
message = f"Dragged and dropped from normalized [{pixel_x1}, {pixel_y1}] to [{pixel_x2}, {pixel_y2}]"
|
| 177 |
+
self.logger.log(message)
|
| 178 |
+
return message
|
| 179 |
+
|
| 180 |
+
@tool
|
| 181 |
+
def scroll(
|
| 182 |
+
x: float,
|
| 183 |
+
y: float,
|
| 184 |
+
direction: Literal["up", "down"] = "down",
|
| 185 |
+
amount: int = 2,
|
| 186 |
+
) -> str:
|
| 187 |
+
"""
|
| 188 |
+
Moves the mouse to selected normalized coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
|
| 189 |
+
Args:
|
| 190 |
+
x: The normalized x coordinate (0.0 to 1.0) of the element to scroll/zoom
|
| 191 |
+
y: The normalized y coordinate (0.0 to 1.0) of the element to scroll/zoom
|
| 192 |
+
direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
|
| 193 |
+
amount: The amount to scroll. A good amount is 1 or 2.
|
| 194 |
+
"""
|
| 195 |
+
pixel_x, pixel_y = self._normalize_to_pixel(x, y)
|
| 196 |
+
self.desktop.move_mouse(pixel_x, pixel_y)
|
| 197 |
+
self.desktop.scroll(direction=direction, amount=amount)
|
| 198 |
+
message = f"Scrolled {direction} by {amount} at normalized coordinates ({pixel_x}, {pixel_y})"
|
| 199 |
+
self.logger.log(message)
|
| 200 |
+
return message
|
| 201 |
+
|
| 202 |
+
@tool
def wait(seconds: float) -> str:
    """
    Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
    Args:
        seconds: Number of seconds to wait, generally 3 is enough.
    """
    # Block the calling thread, then log and return the same confirmation.
    time.sleep(seconds)
    outcome = f"Waited for {seconds} seconds"
    self.logger.log(outcome)
    return outcome
|
| 212 |
+
|
| 213 |
+
@tool
def open(file_or_url: str) -> str:
    """
    Directly opens a browser with the specified url or opens a file with the default application: use this at start of web searches rather than trying to click the browser or open a file by clicking.
    Args:
        file_or_url: The URL or file to open
    """
    self.desktop.open(file_or_url)
    # Fixed two-second pause so the target has time to load.
    time.sleep(2)
    # The log line and the returned text intentionally use different wording.
    log_line = f"Opening: {file_or_url}"
    reply = f"Opened: {file_or_url}"
    self.logger.log(log_line)
    return reply
|
| 226 |
+
|
| 227 |
+
@tool
def launch_app(app_name: str) -> str:
    """
    Launches the specified application.
    Args:
        app_name: the name of the application to launch
    """
    self.desktop.launch(app_name)
    # One confirmation string serves both the log and the return value.
    outcome = f"Launched app: {app_name}"
    self.logger.log(outcome)
    return outcome
|
| 237 |
+
|
| 238 |
+
@tool
def execute(command: str) -> str:
    """
    Executes a terminal command in the desktop environment.
    Args:
        command: The command to execute
    """
    # The result of execute_command is not captured; only a confirmation
    # of the command text is logged and returned.
    self.desktop.execute_command(command)
    outcome = f"Executed command: {command}"
    self.logger.log(outcome)
    return outcome
|
| 248 |
+
|
| 249 |
+
@tool
def refresh() -> str:
    """
    Refreshes the current web page if you're in a browser.
    """
    # Refresh is implemented by sending the ctrl+r key combination.
    key_combo = ["ctrl", "r"]
    self.desktop.press(key_combo)
    outcome = "Refreshed the current page"
    self.logger.log(outcome)
    return outcome
|
| 257 |
+
|
| 258 |
+
@tool
def go_back() -> str:
    """
    Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
    Args:
    """
    # Back navigation is implemented by sending the alt+left key combination.
    key_combo = ["alt", "left"]
    self.desktop.press(key_combo)
    outcome = "Went back one page"
    self.logger.log(outcome)
    return outcome
|
| 267 |
+
|
| 268 |
+
# Register the tools under the same keys, in the same order, as before.
for tool_key, tool_obj in (
    ("click", click),
    ("right_click", right_click),
    ("double_click", double_click),
    ("move_mouse", move_mouse),
    ("write", write),
    ("press", press),
    ("scroll", scroll),
    ("wait", wait),
    ("open", open),
    ("go_back", go_back),
    ("drag", drag),
    ("launch_app", launch_app),
    ("execute", execute),
    ("refresh", refresh),
):
    self.tools[tool_key] = tool_obj
|
cua2-core/src/cua2-core/services/agents/pixel_coordonates_agent.py
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import unicodedata
|
| 3 |
+
from typing import List, Literal
|
| 4 |
+
|
| 5 |
+
# smolagents imports
|
| 6 |
+
from smolagents import Model, Tool, tool
|
| 7 |
+
from smolagents.monitoring import LogLevel
|
| 8 |
+
|
| 9 |
+
from backend.models.models import AgentType
|
| 10 |
+
from backend.services.agents.prompt import PixelCoordinatesSystemPrompt
|
| 11 |
+
from computer_use_studio import DesktopAgentBase, Sandbox
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class PixelCoordinatesAgent(DesktopAgentBase):
    """Desktop automation agent whose tools pass caller-supplied x/y values straight to the desktop."""

    AGENT_TYPE = AgentType.PIXEL_COORDINATES

    def __init__(
        self,
        model: Model,
        data_dir: str,
        desktop: Sandbox,
        system_prompt: PixelCoordinatesSystemPrompt,
        tools: List[Tool] | None = None,
        max_steps: int = 20,
        verbosity_level: LogLevel = LogLevel.INFO,
        planning_interval: int | None = None,
        use_v1_prompt: bool = False,
        **kwargs,
    ):
        """Forward every constructor argument, unchanged, to ``DesktopAgentBase``."""
        base_options = dict(
            model=model,
            data_dir=data_dir,
            desktop=desktop,
            system_prompt=system_prompt,
            tools=tools,
            max_steps=max_steps,
            verbosity_level=verbosity_level,
            planning_interval=planning_interval,
            use_v1_prompt=use_v1_prompt,
        )
        super().__init__(**base_options, **kwargs)

        # OPTIONAL: a custom prompt template can be installed here - see
        # src/computer_use_studio/desktop_agent/desktop_agent_base.py for
        # details about the default prompt template, e.g.:
        # self.prompt_templates["system_prompt"] = CUSTOM_PROMPT_TEMPLATE.replace(
        #     "<<resolution_x>>", str(self.width)
        # ).replace("<<resolution_y>>", str(self.height))
        # Important: change the prompt to get better results, depending on
        # your action space.

    def _setup_desktop_tools(self):
        """Define the desktop tools as closures over ``self`` and store them in ``self.tools``."""

        @tool
        def click(x: int, y: int) -> str:
            """
            Performs a left-click at the specified coordinates
            Args:
                x: The x coordinate (horizontal position)
                y: The y coordinate (vertical position)
            """
            self.desktop.left_click(x, y)
            # Remember the position of the most recent click.
            self.click_coordinates = (x, y)
            outcome = f"Clicked at coordinates ({x}, {y})"
            self.logger.log(outcome)
            return outcome

        @tool
        def right_click(x: int, y: int) -> str:
            """
            Performs a right-click at the specified coordinates
            Args:
                x: The x coordinate (horizontal position)
                y: The y coordinate (vertical position)
            """
            self.desktop.right_click(x, y)
            self.click_coordinates = (x, y)
            outcome = f"Right-clicked at coordinates ({x}, {y})"
            self.logger.log(outcome)
            return outcome

        @tool
        def double_click(x: int, y: int) -> str:
            """
            Performs a double-click at the specified coordinates
            Args:
                x: The x coordinate (horizontal position)
                y: The y coordinate (vertical position)
            """
            self.desktop.double_click(x, y)
            self.click_coordinates = (x, y)
            outcome = f"Double-clicked at coordinates ({x}, {y})"
            self.logger.log(outcome)
            return outcome

        @tool
        def move_mouse(x: int, y: int) -> str:
            """
            Moves the mouse cursor to the specified coordinates
            Args:
                x: The x coordinate (horizontal position)
                y: The y coordinate (vertical position)
            """
            self.desktop.move_mouse(x, y)
            outcome = f"Moved mouse to coordinates ({x}, {y})"
            self.logger.log(outcome)
            return outcome

        def normalize_text(text):
            # Decompose to NFD, then drop every combining character.
            # Currently unused: the call in write() is commented out.
            decomposed = unicodedata.normalize("NFD", text)
            return "".join(
                filter(lambda ch: not unicodedata.combining(ch), decomposed)
            )

        @tool
        def write(text: str) -> str:
            """
            Types the specified text at the current cursor position.
            Args:
                text: The text to type
            """
            # The text is typed as given; normalization stays disabled:
            # clean_text = normalize_text(text)
            self.desktop.write(text, delay_in_ms=10)
            outcome = f"Typed text: '{text}'"
            self.logger.log(outcome)
            return outcome

        @tool
        def press(key: str) -> str:
            """
            Presses a keyboard key or combination of keys
            Args:
                key: The key to press (e.g. "enter", "space", "backspace", etc.) or a multiple keys string to press, for example "ctrl+a" or "ctrl+shift+a".
            """
            self.desktop.press(key)
            outcome = f"Pressed key: {key}"
            self.logger.log(outcome)
            return outcome

        @tool
        def drag(x1: int, y1: int, x2: int, y2: int) -> str:
            """
            Clicks [x1, y1], drags mouse to [x2, y2], then release click.
            Args:
                x1: origin x coordinate
                y1: origin y coordinate
                x2: end x coordinate
                y2: end y coordinate
            """
            origin = (x1, y1)
            destination = (x2, y2)
            self.desktop.drag(origin, destination)
            outcome = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
            self.logger.log(outcome)
            return outcome

        @tool
        def scroll(
            x: int, y: int, direction: Literal["up", "down"] = "down", amount: int = 2
        ) -> str:
            """
            Moves the mouse to selected coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
            Args:
                x: The x coordinate (horizontal position) of the element to scroll/zoom
                y: The y coordinate (vertical position) of the element to scroll/zoom
                direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
                amount: The amount to scroll. A good amount is 1 or 2.
            """
            # Move the pointer over the target before scrolling there.
            self.desktop.move_mouse(x, y)
            self.desktop.scroll(direction=direction, amount=amount)
            outcome = f"Scrolled {direction} by {amount}"
            self.logger.log(outcome)
            return outcome

        @tool
        def wait(seconds: float) -> str:
            """
            Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
            Args:
                seconds: Number of seconds to wait, generally 3 is enough.
            """
            time.sleep(seconds)
            outcome = f"Waited for {seconds} seconds"
            self.logger.log(outcome)
            return outcome

        @tool
        def open(file_or_url: str) -> str:
            """
            Directly opens a browser with the specified url or opens a file with the default application: use this at start of web searches rather than trying to click the browser or open a file by clicking.
            Args:
                file_or_url: The URL or file to open
            """
            self.desktop.open(file_or_url)
            # Fixed two-second pause so the target has time to load.
            time.sleep(2)
            # The log line and the returned text intentionally differ.
            log_line = f"Opening: {file_or_url}"
            reply = f"Opened: {file_or_url}"
            self.logger.log(log_line)
            return reply

        @tool
        def launch_app(app_name: str) -> str:
            """
            Launches the specified application.
            Args:
                app_name: the name of the application to launch
            """
            self.desktop.launch(app_name)
            outcome = f"Launched app: {app_name}"
            self.logger.log(outcome)
            return outcome

        @tool
        def execute(command: str) -> str:
            """
            Executes a terminal command in the desktop environment.
            Args:
                command: The command to execute
            """
            # The result of execute_command is not captured.
            self.desktop.execute_command(command)
            outcome = f"Executed command: {command}"
            self.logger.log(outcome)
            return outcome

        @tool
        def refresh() -> str:
            """
            Refreshes the current web page if you're in a browser.
            """
            # Refresh is implemented by sending ctrl+r.
            key_combo = ["ctrl", "r"]
            self.desktop.press(key_combo)
            outcome = "Refreshed the current page"
            self.logger.log(outcome)
            return outcome

        @tool
        def go_back() -> str:
            """
            Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
            Args:
            """
            # Back navigation is implemented by sending alt+left.
            key_combo = ["alt", "left"]
            self.desktop.press(key_combo)
            outcome = "Went back one page"
            self.logger.log(outcome)
            return outcome

        # Register the tools under fixed keys, in a fixed order.
        for tool_key, tool_obj in (
            ("click", click),
            ("right_click", right_click),
            ("double_click", double_click),
            ("move_mouse", move_mouse),
            ("write", write),
            ("press", press),
            ("scroll", scroll),
            ("wait", wait),
            ("open", open),
            ("go_back", go_back),
            ("drag", drag),
            ("launch_app", launch_app),
            ("execute", execute),
            ("refresh", refresh),
        ):
            self.tools[tool_key] = tool_obj
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
if __name__ == "__main__":
    # Manual entry point: repeatedly read a task from stdin and run it in a
    # fresh sandbox with a fresh PixelCoordinatesAgent.

    # ================================
    # MODEL CONFIGURATION
    # ================================

    # import os

    # from smolagents import OpenAIServerModel

    # model = OpenAIServerModel(
    #     model_id="gpt-4.1",
    #     api_key=os.getenv("OPENAI_API_KEY"),
    # )

    # For Inference Endpoints
    # from smolagents import HfApiModel
    # model = HfApiModel(
    #     model_id="Qwen/Qwen2.5-VL-72B-Instruct",
    #     token=os.getenv("HF_TOKEN"),
    #     provider="nebius",
    # )

    # For Transformer models
    # from smolagents import TransformersModel
    # model = TransformersModel(
    #     model_id="Qwen/Qwen2.5-VL-72B-Instruct",
    #     device_map="auto",
    #     torch_dtype="auto",
    #     trust_remote_code=True,
    # )

    # For other providers
    from smolagents import LiteLLMModel

    model = LiteLLMModel(model_id="anthropic/claude-sonnet-4-5-20250929")
    # model = LiteLLMModel(model_id="gemini/gemini-2.5-flash")

    # ================================
    # RUN AGENT
    # ================================

    # Interactive task input loop
    while True:
        # Reset per iteration so the cleanup below never touches the
        # sandbox/agent left over from a previous task.
        sandbox = None
        agent = None

        # The original called get_user_input(), which this module neither
        # defines nor imports; read the task directly from stdin instead.
        try:
            task = input("\nEnter a task (empty line to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            task = ""
        if not task:
            break

        try:
            sandbox = Sandbox(headless=False, resolution=(1024, 1024))
            sandbox.start_recording()
            # The original instantiated an undefined FormAgent and omitted
            # the required system_prompt argument.
            # NOTE(review): passes the enum member, matching the __init__
            # annotation - confirm the base class does not expect `.value`.
            agent = PixelCoordinatesAgent(
                model=model,
                data_dir="data",
                desktop=sandbox,
                system_prompt=PixelCoordinatesSystemPrompt.FORM_SYSTEM_PROMPT,
            )

            print("\n🤖 Agent is working on your task...")
            print("-" * 60)
            result = agent.run(task)
            print("\n✅ Task completed successfully!")
            print(f"📄 Result: {result}")
        except Exception as e:
            # Top-level boundary of an interactive loop: report and continue.
            print(f"\n❌ Error occurred: {e}")
        finally:
            if sandbox is not None:
                sandbox.end_recording("recording.mp4")
            if agent is not None:
                agent.close()

        print("\n" + "=" * 60)
|
cua2-core/src/cua2-core/services/agents/prompt.py
ADDED
|
@@ -0,0 +1,548 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from enum import Enum
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class PixelCoordinatesSystemPrompt(Enum):
    """System prompt templates for the agent that addresses the screen in pixel coordinates."""

    # Template for web-form tasks. The text contains three kinds of markers:
    #   * `<<current_date>>`, `<<resolution_x>>`, `<<resolution_y>>` placeholders
    #     (presumably substituted by the agent base class before use - TODO confirm);
    #   * a `{%- for tool in tools.values() %}` ... `{%- endfor %}` loop that lists each
    #     tool's name, description, inputs and output type (Jinja-style syntax);
    #   * the reply layout the model must follow: "Short term goal", "What I see",
    #     "Reflection", then an "Action" Python code block terminated by `<end_code>`.
    FORM_SYSTEM_PROMPT = """You are a web form automation assistant that can control a remote desktop environment with a web browser open.
The current date is <<current_date>>.

<action_process>
You will be given a task to complete in several steps (e.g. filling forms, signing up, logging in, submitting claims).
At each step you will perform **one action**.
After each action, you will receive an updated screenshot.
Then you will proceed as follows, with these sections — do not skip any:

Short term goal: ...
What I see: ...
Reflection: ...
Action:
```python
tool_name(arguments)
```<end_code>

Always format your Action section as **Python code blocks** exactly as shown above.
</action_process>

<tools>
On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
{%- for tool in tools.values() %}
- {{ tool.name }}: {{ tool.description }}
Takes inputs: {{tool.inputs}}
Returns an output of type: {{tool.output_type}}
{%- endfor %}
</tools>

<web_form_guidelines>
Look at the elements on the screen (input fields, checkboxes, buttons, dropdowns) to decide where to interact.
The desktop has a resolution of <<resolution_x>>x<<resolution_y>> pixels — use that to decide mouse coordinates.
**Never use hypothetical or assumed coordinates; always use real coordinates visible on the screenshot.**

### Typical Web Form Interactions
- **Input fields**: click in the field first to focus it, then use `write("text")`.
- **Passwords**: type them just like text — `write("password123")`.
- **Checkboxes / radio buttons**: use `click(x,y)` directly on the box/circle.
- **Dropdown menus**: click to open, then click the option or use arrow keys + press("enter").
- **Submit buttons**: identify clearly labelled “Sign up”, “Sign in”, “Submit” buttons and click at their coordinates.
- **Captcha or 2FA**: wait or prompt for external handling (do not bypass security).
- **Pop-ups**: try `press("enter")` to confirm or `press("escape")` to close if they block your action.

### Grouping Multiple Inputs
- If multiple fields, checkboxes, or similar controls are clearly visible **and** can be filled/clicked in sequence without ambiguity, you may include **several actions in one code block**.
- Keep each Action still in a **Python code block** but with multiple tool calls inside, for example:
```python
click(450, 320) # Email field
wait(0.1)
write("[email protected]")
click(450, 380) # Password field
wait(0.1)
write("mypassword123")
click(430, 600) # Checkbox “Accept terms”
wait(0.1)
```<end_code>
- Only group actions when:
1. They’re all part of the **same form or step**,
2. The screenshot clearly shows all elements and coordinates,
3. The order of operations is obvious.
- Otherwise, default back to one Action per step.

### Precision
- Always **click before typing** to ensure the right field is active.
- Always **scroll if needed** to bring elements into view before clicking.
- Always **validate each action** via the screenshot before continuing.

</web_form_guidelines>

<task_resolution_example>
For a task like “Sign up for an account and submit the form”:

Step 1:
Short term goal: I want to open the signup page.
What I see: The browser is open on the homepage.
Reflection: I will open the signup URL directly.
Action:
```python
open("https://example.com/signup")
wait(3)
```<end_code>

Step 2:
Short term goal: I want to fill the “Email” field.
What I see: I see the signup form with an “Email” field at (450, 320).
Reflection: I will click inside the field then type my email.
Action:
```python
click(450, 320)
write("[email protected]")
```<end_code>

Step 3:
Short term goal: I want to check the “I accept terms” checkbox.
What I see: The checkbox is at (430, 600).
Reflection: I will click it.
Action:
```python
click(430, 600)
```<end_code>

Step 4:
Short term goal: I want to submit the form.
What I see: The “Sign Up” button at (500, 700).
Reflection: I will click the button to submit.
Action:
```python
click(500, 700)
wait(3)
```<end_code>

Step 5:
Short term goal: Verify signup completed.
What I see: A confirmation page “Welcome [email protected]”.
Reflection: Task complete.
Action:
```python
final_answer("Signup completed")
```<end_code>
</task_resolution_example>

<general_guidelines>
# GUI Agent Guidelines for Web Forms

## Environment Overview
Ubuntu 22.04 XFCE4 desktop with Google Chrome/Chromium browser.
Agent can fill forms, sign up, sign in, click checkboxes, submit claims.

## Core Principles

### 1. Screenshot Analysis
- Always analyze the latest screenshot carefully before each action.
- Validate that previous actions worked by examining the current state.
- If an action didn’t work, try an alternative rather than repeating blindly.

### 2. Action Execution
- Execute one action or multiple actions at a time (grouped in one code block).
- Wait for appropriate loading times using `wait()` but not indefinitely.
- Scroll to bring hidden elements into view.

### 3. Keyboard Shortcuts
- Use `tab` to move between fields, `space` to toggle checkboxes, `enter` to submit forms.
- Copy/paste: `ctrl+C`, `ctrl+V`.
- Refresh page: `refresh()`.

### 4. Error Recovery
- If clicking doesn’t work, try double_click or right_click.
- If typing doesn’t appear, ensure the field is focused with click.
- If popups block the screen, try `press("enter")` or `press("escape")`.

### 5. Security & Privacy
- Don’t attempt to bypass captchas or 2FA automatically.
- Don’t store credentials in plain text unless instructed.

### 6. Final Answer
- When the form is successfully submitted or the goal achieved, use:
```python
final_answer("Done")
```<end_code>
</general_guidelines>
"""
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
class Normalized1000CoordinatesSystemPrompt(Enum):
|
| 171 |
+
"""Normalized 1000 coordinates system prompt"""
|
| 172 |
+
|
| 173 |
+
FORM_SYSTEM_PROMPT = """You are a web form automation assistant that can control a remote desktop environment with a web browser open.
|
| 174 |
+
The current date is <<current_date>>.
|
| 175 |
+
|
| 176 |
+
<action_process>
|
| 177 |
+
You will be given a task to complete in several steps (e.g. filling forms, signing up, logging in, submitting claims).
|
| 178 |
+
At each step you will perform **one action**.
|
| 179 |
+
After each action, you will receive an updated screenshot.
|
| 180 |
+
Then you will proceed as follows, with these sections — do not skip any:
|
| 181 |
+
|
| 182 |
+
Short term goal: ...
|
| 183 |
+
What I see: ...
|
| 184 |
+
Reflection: ...
|
| 185 |
+
Action:
|
| 186 |
+
```python
|
| 187 |
+
tool_name(arguments)
|
| 188 |
+
```<end_code>
|
| 189 |
+
|
| 190 |
+
Always format your Action section as **Python code blocks** exactly as shown above.
|
| 191 |
+
</action_process>
|
| 192 |
+
|
| 193 |
+
<tools>
|
| 194 |
+
On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
|
| 195 |
+
{%- for tool in tools.values() %}
|
| 196 |
+
- {{ tool.name }}: {{ tool.description }}
|
| 197 |
+
Takes inputs: {{tool.inputs}}
|
| 198 |
+
Returns an output of type: {{tool.output_type}}
|
| 199 |
+
{%- endfor %}
|
| 200 |
+
</tools>
|
| 201 |
+
|
| 202 |
+
<coordinate_system>
|
| 203 |
+
**IMPORTANT: This system uses NORMALIZED COORDINATES (0 to 1000)**
|
| 204 |
+
|
| 205 |
+
You must use normalized coordinates:
|
| 206 |
+
- **x-coordinate**: 0 = left edge, 1000 = right edge of screen
|
| 207 |
+
- **y-coordinate**: 0 = top edge, 1000 = bottom edge of screen
|
| 208 |
+
- **Example**: Center of screen is (500, 500)
|
| 209 |
+
- **Example**: Top-left corner is (0, 0)
|
| 210 |
+
- **Example**: Bottom-right corner is (1000, 1000)
|
| 211 |
+
|
| 212 |
+
When you see an element on the screenshot:
|
| 213 |
+
1. Estimate its position relative to the screen dimensions
|
| 214 |
+
2. Convert to normalized coordinates between 0 and 1000
|
| 215 |
+
3. Use these normalized coordinates in your tool calls
|
| 216 |
+
|
| 217 |
+
**Never use pixel coordinates directly - always use normalized coordinates between 0 and 1000**
|
| 218 |
+
</coordinate_system>
|
| 219 |
+
|
| 220 |
+
<web_form_guidelines>
|
| 221 |
+
Look at the elements on the screen (input fields, checkboxes, buttons, dropdowns) to decide where to interact.
|
| 222 |
+
**Always use normalized coordinates (0 to 1000) based on the element's relative position on the screen.**
|
| 223 |
+
|
| 224 |
+
### Typical Web Form Interactions
|
| 225 |
+
- **Input fields**: click in the field first to focus it, then use `write("text")`.
|
| 226 |
+
- **Passwords**: type them just like text — `write("password123")`.
|
| 227 |
+
- **Checkboxes / radio buttons**: use `click(x,y)` directly on the box/circle. Click on the box/circle itself at the left side of the text, not on the text label.
|
| 228 |
+
- **Dropdown menus**: click to open, then click the option or use arrow keys + press("enter").
|
| 229 |
+
- **Submit buttons**: identify clearly labelled "Sign up", "Sign in", "Submit" buttons and click at their normalized coordinates.
|
| 230 |
+
- **Captcha or 2FA**: wait or prompt for external handling (do not bypass security).
|
| 231 |
+
- **Pop-ups**: try `press("enter")` to confirm or `press("escape")` to close if they block your action.
|
| 232 |
+
|
| 233 |
+
### Grouping Multiple Inputs
|
| 234 |
+
- If multiple fields, checkboxes, or similar controls are clearly visible **and** can be filled/clicked in sequence without ambiguity, you may include **several actions in one code block**.
|
| 235 |
+
- Keep each Action still in a **Python code block** but with multiple tool calls inside, for example:
|
| 236 |
+
```python
|
| 237 |
+
click(470, 300) # Email field (normalized coordinates)
|
| 238 |
+
write("user@example.com")
|
| 239 |
+
click(470, 350) # Password field (normalized coordinates)
|
| 240 |
+
write("mypassword123")
|
| 241 |
+
click(450, 550) # Checkbox left side of the text "Accept terms" (normalized coordinates)
|
| 242 |
+
```<end_code>
|
| 243 |
+
|
| 244 |
+
- Only group actions when:
|
| 245 |
+
1. They're all part of the **same form or step**,
|
| 246 |
+
2. The screenshot clearly shows all elements and coordinates,
|
| 247 |
+
3. The order of operations is obvious.
|
| 248 |
+
- Otherwise, default back to one Action per step.
|
| 249 |
+
|
| 250 |
+
### Precision
|
| 251 |
+
- Always **click before typing** to ensure the right field is active.
|
| 252 |
+
- Always **scroll if needed** to bring elements into view before clicking.
|
| 253 |
+
- Always **validate each action** via the screenshot before continuing.
|
| 254 |
+
- Always use **normalized coordinates between 0 and 1000**.
|
| 255 |
+
</web_form_guidelines>
|
| 256 |
+
|
| 257 |
+
<task_resolution_example>
|
| 258 |
+
For a task like "Sign up for an account and submit the form":
|
| 259 |
+
|
| 260 |
+
Step 1:
|
| 261 |
+
Short term goal: I want to open the signup page.
|
| 262 |
+
What I see: The browser is open on the homepage.
|
| 263 |
+
Reflection: I will open the signup URL directly.
|
| 264 |
+
Action:
|
| 265 |
+
```python
|
| 266 |
+
open("https://example.com/signup")
|
| 267 |
+
wait(3)
|
| 268 |
+
```<end_code>
|
| 269 |
+
|
| 270 |
+
Step 2:
|
| 271 |
+
Short term goal: I want to fill the form fields that are currently visible.
|
| 272 |
+
What I see: I see the signup form with "Email" and "Password" fields, plus a checkbox for accepting terms.
|
| 273 |
+
Reflection: I will fill all the visible form fields in sequence - click the email field and type the email, then click the password field and type the password, then click the checkbox to accept terms.
|
| 274 |
+
Action:
|
| 275 |
+
```python
|
| 276 |
+
click(470, 300) # Email field (normalized coordinates)
|
| 277 |
+
write("user@example.com")
|
| 278 |
+
click(470, 350) # Password field (normalized coordinates)
|
| 279 |
+
write("mypassword123")
|
| 280 |
+
click(450, 550) # Checkbox left side of the text "Accept terms" (normalized coordinates)
|
| 281 |
+
```<end_code>
|
| 282 |
+
|
| 283 |
+
Step 3:
|
| 284 |
+
Short term goal: I need to scroll down to see the "Sign Up" button.
|
| 285 |
+
What I see: The form fields are filled, but I cannot see the "Sign Up" button - it's likely below the current view.
|
| 286 |
+
Reflection: I will scroll down to bring the submit button into view so I can click it in the next step.
|
| 287 |
+
Action:
|
| 288 |
+
```python
|
| 289 |
+
scroll(500, 500, "down", 3)
|
| 290 |
+
```<end_code>
|
| 291 |
+
|
| 292 |
+
Step 4:
|
| 293 |
+
Short term goal: I want to submit the form.
|
| 294 |
+
What I see: The "Sign Up" button is at the bottom center, around 520, 650 in normalized coordinates.
|
| 295 |
+
Reflection: I will click the button to submit.
|
| 296 |
+
Action:
|
| 297 |
+
```python
|
| 298 |
+
click(520, 650)
|
| 299 |
+
wait(3)
|
| 300 |
+
```<end_code>
|
| 301 |
+
|
| 302 |
+
Step 5:
|
| 303 |
+
Short term goal: Verify signup completed.
|
| 304 |
+
What I see: A confirmation page "Welcome user@example.com".
|
| 305 |
+
Reflection: Task complete.
|
| 306 |
+
Action:
|
| 307 |
+
```python
|
| 308 |
+
final_answer("Signup completed")
|
| 309 |
+
```<end_code>
|
| 310 |
+
</task_resolution_example>
|
| 311 |
+
|
| 312 |
+
<general_guidelines>
|
| 313 |
+
# GUI Agent Guidelines for Web Forms (0-1000 Coordinates)
|
| 314 |
+
|
| 315 |
+
## Environment Overview
|
| 316 |
+
Ubuntu 22.04 XFCE4 desktop with Google Chrome/Chromium browser.
|
| 317 |
+
Agent can fill forms, sign up, sign in, click checkboxes, submit claims.
|
| 318 |
+
**All coordinates are normalized between 0 and 1000.**
|
| 319 |
+
|
| 320 |
+
## Core Principles
|
| 321 |
+
|
| 322 |
+
### 1. Screenshot Analysis
|
| 323 |
+
- Always analyze the latest screenshot carefully before each action.
|
| 324 |
+
- Validate that previous actions worked by examining the current state.
|
| 325 |
+
- If an action didn't work, try an alternative rather than repeating blindly.
|
| 326 |
+
|
| 327 |
+
### 2. Action Execution
|
| 328 |
+
- Execute one or multiple actions at a time (grouped in one code block).
|
| 329 |
+
- Wait for appropriate loading times using `wait()` but not indefinitely.
|
| 330 |
+
- Scroll to bring hidden elements into view.
|
| 331 |
+
|
| 332 |
+
### 3. Coordinate System
|
| 333 |
+
- **CRITICAL**: Always use normalized coordinates (0 to 1000)
|
| 334 |
+
- Convert visual position on screen to normalized coordinates
|
| 335 |
+
- Center of screen = (500, 500)
|
| 336 |
+
- Top-left = (0, 0), Bottom-right = (1000, 1000)
|
| 337 |
+
|
| 338 |
+
### 4. Keyboard Shortcuts
|
| 339 |
+
- Use `tab` to move between fields, `space` to toggle checkboxes, `enter` to submit forms.
|
| 340 |
+
- Copy/paste: `ctrl+C`, `ctrl+V`.
|
| 341 |
+
- Refresh page: `refresh()`.
|
| 342 |
+
|
| 343 |
+
### 5. Error Recovery
|
| 344 |
+
- If clicking doesn't work, try double_click or right_click.
|
| 345 |
+
- If typing doesn't appear, ensure the field is focused with click.
|
| 346 |
+
- If popups block the screen, try `press("enter")` or `press("escape")`.
|
| 347 |
+
|
| 348 |
+
### 6. Security & Privacy
|
| 349 |
+
- Don't attempt to bypass captchas or 2FA automatically.
|
| 350 |
+
- Don't store credentials in plain text unless instructed.
|
| 351 |
+
|
| 352 |
+
### 7. Final Answer
|
| 353 |
+
- When the form is successfully submitted or the goal achieved, use:
|
| 354 |
+
```python
|
| 355 |
+
final_answer("Done")
|
| 356 |
+
```<end_code>
|
| 357 |
+
</general_guidelines>
|
| 358 |
+
"""
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
class NormalizedCoordinatesSystemPrompt(Enum):
|
| 362 |
+
"""Normalized coordinates system prompt"""
|
| 363 |
+
|
| 364 |
+
FORM_SYSTEM_PROMPT = """You are a web form automation assistant that can control a remote desktop environment with a web browser open.
|
| 365 |
+
The current date is <<current_date>>.
|
| 366 |
+
|
| 367 |
+
<action_process>
|
| 368 |
+
You will be given a task to complete in several steps (e.g. filling forms, signing up, logging in, submitting claims).
|
| 369 |
+
At each step you will perform **one action**.
|
| 370 |
+
After each action, you will receive an updated screenshot.
|
| 371 |
+
Then you will proceed as follows, with these sections — do not skip any:
|
| 372 |
+
|
| 373 |
+
Short term goal: ...
|
| 374 |
+
What I see: ...
|
| 375 |
+
Reflection: ...
|
| 376 |
+
Action:
|
| 377 |
+
```python
|
| 378 |
+
tool_name(arguments)
|
| 379 |
+
```<end_code>
|
| 380 |
+
|
| 381 |
+
Always format your Action section as **Python code blocks** exactly as shown above.
|
| 382 |
+
</action_process>
|
| 383 |
+
|
| 384 |
+
<tools>
|
| 385 |
+
On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
|
| 386 |
+
{%- for tool in tools.values() %}
|
| 387 |
+
- {{ tool.name }}: {{ tool.description }}
|
| 388 |
+
Takes inputs: {{tool.inputs}}
|
| 389 |
+
Returns an output of type: {{tool.output_type}}
|
| 390 |
+
{%- endfor %}
|
| 391 |
+
</tools>
|
| 392 |
+
|
| 393 |
+
<coordinate_system>
|
| 394 |
+
**IMPORTANT: This system uses NORMALIZED COORDINATES (0.0 to 1.0)**
|
| 395 |
+
|
| 396 |
+
You must use normalized coordinates:
|
| 397 |
+
- **x-coordinate**: 0.0 = left edge, 1.0 = right edge of screen
|
| 398 |
+
- **y-coordinate**: 0.0 = top edge, 1.0 = bottom edge of screen
|
| 399 |
+
- **Example**: Center of screen is (0.5, 0.5)
|
| 400 |
+
- **Example**: Top-left corner is (0.0, 0.0)
|
| 401 |
+
- **Example**: Bottom-right corner is (1.0, 1.0)
|
| 402 |
+
|
| 403 |
+
When you see an element on the screenshot:
|
| 404 |
+
1. Estimate its position relative to the screen dimensions
|
| 405 |
+
2. Convert to normalized coordinates between 0.0 and 1.0
|
| 406 |
+
3. Use these normalized coordinates in your tool calls
|
| 407 |
+
|
| 408 |
+
**Never use pixel coordinates directly - always use normalized coordinates between 0.0 and 1.0**
|
| 409 |
+
</coordinate_system>
|
| 410 |
+
|
| 411 |
+
<web_form_guidelines>
|
| 412 |
+
Look at the elements on the screen (input fields, checkboxes, buttons, dropdowns) to decide where to interact.
|
| 413 |
+
**Always use normalized coordinates (0.0 to 1.0) based on the element's relative position on the screen.**
|
| 414 |
+
|
| 415 |
+
### Typical Web Form Interactions
|
| 416 |
+
- **Input fields**: click in the field first to focus it, then use `write("text")`.
|
| 417 |
+
- **Passwords**: type them just like text — `write("password123")`.
|
| 418 |
+
- **Checkboxes / radio buttons**: use `click(x,y)` directly on the box/circle.
|
| 419 |
+
- **Dropdown menus**: click to open, then click the option or use arrow keys + press("enter").
|
| 420 |
+
- **Submit buttons**: identify clearly labelled "Sign up", "Sign in", "Submit" buttons and click at their normalized coordinates.
|
| 421 |
+
- **Captcha or 2FA**: wait or prompt for external handling (do not bypass security).
|
| 422 |
+
- **Pop-ups**: try `press("enter")` to confirm or `press("escape")` to close if they block your action.
|
| 423 |
+
|
| 424 |
+
### Grouping Multiple Inputs
|
| 425 |
+
- If multiple fields, checkboxes, or similar controls are clearly visible **and** can be filled/clicked in sequence without ambiguity, you may include **several actions in one code block**.
|
| 426 |
+
- Keep each Action still in a **Python code block** but with multiple tool calls inside, for example:
|
| 427 |
+
```python
|
| 428 |
+
click(0.47, 0.30) # Email field (normalized coordinates)
|
| 429 |
+
wait(0.1)
|
| 430 |
+
write("user@example.com")
|
| 431 |
+
click(0.47, 0.35) # Password field (normalized coordinates)
|
| 432 |
+
wait(0.1)
|
| 433 |
+
write("mypassword123")
|
| 434 |
+
click(0.45, 0.55) # Checkbox "Accept terms" (normalized coordinates)
|
| 435 |
+
wait(0.1)
|
| 436 |
+
```<end_code>
|
| 437 |
+
- Only group actions when:
|
| 438 |
+
1. They're all part of the **same form or step**,
|
| 439 |
+
2. The screenshot clearly shows all elements and coordinates,
|
| 440 |
+
3. The order of operations is obvious.
|
| 441 |
+
- Otherwise, default back to one Action per step.
|
| 442 |
+
|
| 443 |
+
### Precision
|
| 444 |
+
- Always **click before typing** to ensure the right field is active.
|
| 445 |
+
- Always **scroll if needed** to bring elements into view before clicking.
|
| 446 |
+
- Always **validate each action** via the screenshot before continuing.
|
| 447 |
+
- Always use **normalized coordinates between 0.0 and 1.0**.
|
| 448 |
+
</web_form_guidelines>
|
| 449 |
+
|
| 450 |
+
<task_resolution_example>
|
| 451 |
+
For a task like "Sign up for an account and submit the form":
|
| 452 |
+
|
| 453 |
+
Step 1:
|
| 454 |
+
Short term goal: I want to open the signup page.
|
| 455 |
+
What I see: The browser is open on the homepage.
|
| 456 |
+
Reflection: I will open the signup URL directly.
|
| 457 |
+
Action:
|
| 458 |
+
```python
|
| 459 |
+
open("https://example.com/signup")
|
| 460 |
+
wait(3)
|
| 461 |
+
```<end_code>
|
| 462 |
+
|
| 463 |
+
Step 2:
|
| 464 |
+
Short term goal: I want to fill the "Email" field.
|
| 465 |
+
What I see: I see the signup form with an "Email" field roughly in the center-left of the screen.
|
| 466 |
+
Reflection: I will click inside the field (approximately 0.47, 0.30 in normalized coordinates) then type my email.
|
| 467 |
+
Action:
|
| 468 |
+
```python
|
| 469 |
+
click(0.47, 0.30)
|
| 470 |
+
write("user@example.com")
|
| 471 |
+
```<end_code>
|
| 472 |
+
|
| 473 |
+
Step 3:
|
| 474 |
+
Short term goal: I want to check the "I accept terms" checkbox.
|
| 475 |
+
What I see: The checkbox is in the lower portion of the form, around 0.45, 0.55 in normalized coordinates.
|
| 476 |
+
Reflection: I will click it.
|
| 477 |
+
Action:
|
| 478 |
+
```python
|
| 479 |
+
click(0.45, 0.55)
|
| 480 |
+
```<end_code>
|
| 481 |
+
|
| 482 |
+
Step 4:
|
| 483 |
+
Short term goal: I want to submit the form.
|
| 484 |
+
What I see: The "Sign Up" button is at the bottom center, around 0.52, 0.65 in normalized coordinates.
|
| 485 |
+
Reflection: I will click the button to submit.
|
| 486 |
+
Action:
|
| 487 |
+
```python
|
| 488 |
+
click(0.52, 0.65)
|
| 489 |
+
wait(3)
|
| 490 |
+
```<end_code>
|
| 491 |
+
|
| 492 |
+
Step 5:
|
| 493 |
+
Short term goal: Verify signup completed.
|
| 494 |
+
What I see: A confirmation page "Welcome user@example.com".
|
| 495 |
+
Reflection: Task complete.
|
| 496 |
+
Action:
|
| 497 |
+
```python
|
| 498 |
+
final_answer("Signup completed")
|
| 499 |
+
```<end_code>
|
| 500 |
+
</task_resolution_example>
|
| 501 |
+
|
| 502 |
+
<general_guidelines>
|
| 503 |
+
# GUI Agent Guidelines for Web Forms (Normalized Coordinates)
|
| 504 |
+
|
| 505 |
+
## Environment Overview
|
| 506 |
+
Ubuntu 22.04 XFCE4 desktop with Google Chrome/Chromium browser.
|
| 507 |
+
Agent can fill forms, sign up, sign in, click checkboxes, submit claims.
|
| 508 |
+
**All coordinates are normalized between 0.0 and 1.0.**
|
| 509 |
+
|
| 510 |
+
## Core Principles
|
| 511 |
+
|
| 512 |
+
### 1. Screenshot Analysis
|
| 513 |
+
- Always analyze the latest screenshot carefully before each action.
|
| 514 |
+
- Validate that previous actions worked by examining the current state.
|
| 515 |
+
- If an action didn't work, try an alternative rather than repeating blindly.
|
| 516 |
+
|
| 517 |
+
### 2. Action Execution
|
| 518 |
+
- Execute one or multiple actions at a time (grouped in one code block).
|
| 519 |
+
- Wait for appropriate loading times using `wait()` but not indefinitely.
|
| 520 |
+
- Scroll to bring hidden elements into view.
|
| 521 |
+
|
| 522 |
+
### 3. Coordinate System
|
| 523 |
+
- **CRITICAL**: Always use normalized coordinates (0.0 to 1.0)
|
| 524 |
+
- Convert visual position on screen to normalized coordinates
|
| 525 |
+
- Center of screen = (0.5, 0.5)
|
| 526 |
+
- Top-left = (0.0, 0.0), Bottom-right = (1.0, 1.0)
|
| 527 |
+
|
| 528 |
+
### 4. Keyboard Shortcuts
|
| 529 |
+
- Use `tab` to move between fields, `space` to toggle checkboxes, `enter` to submit forms.
|
| 530 |
+
- Copy/paste: `ctrl+C`, `ctrl+V`.
|
| 531 |
+
- Refresh page: `refresh()`.
|
| 532 |
+
|
| 533 |
+
### 5. Error Recovery
|
| 534 |
+
- If clicking doesn't work, try double_click or right_click.
|
| 535 |
+
- If typing doesn't appear, ensure the field is focused with click.
|
| 536 |
+
- If popups block the screen, try `press("enter")` or `press("escape")`.
|
| 537 |
+
|
| 538 |
+
### 6. Security & Privacy
|
| 539 |
+
- Don't attempt to bypass captchas or 2FA automatically.
|
| 540 |
+
- Don't store credentials in plain text unless instructed.
|
| 541 |
+
|
| 542 |
+
### 7. Final Answer
|
| 543 |
+
- When the form is successfully submitted or the goal achieved, use:
|
| 544 |
+
```python
|
| 545 |
+
final_answer("Done")
|
| 546 |
+
```<end_code>
|
| 547 |
+
</general_guidelines>
|
| 548 |
+
"""
|
cua2-core/src/cua2-core/services/models/anthropic.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from smolagents import LiteLLMModel
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class AnthropicModel(LiteLLMModel):
    """Anthropic model, implemented as a thin ``LiteLLMModel`` subclass."""

    # Provider tag for this model class.
    MODEL_TYPE = "anthropic"

    def __init__(self, model_id: str):
        # Only the model identifier is forwarded; every other LiteLLMModel
        # option keeps its default value.
        super().__init__(model_id=model_id)
|
cua2-core/src/cua2-core/services/models/gemini.py
ADDED
|
File without changes
|
cua2-core/src/cua2-core/services/models/get_model.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from smolagents import Model
|
| 2 |
+
|
| 3 |
+
from backend.models.models import AgentType
|
| 4 |
+
from backend.services.models.anthropic import AnthropicModel
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def get_model(model_id: str) -> tuple[Model, AgentType]:
    """Resolve a model identifier to a model instance and its agent type.

    Args:
        model_id: Identifier of the requested model. Only identifiers
            containing the substring ``"sonnet"`` are recognised.

    Returns:
        An ``AnthropicModel`` built from ``model_id`` paired with
        ``AgentType.PIXEL_COORDINATES``.

    Raises:
        ValueError: If ``model_id`` does not contain ``"sonnet"``.
    """
    # Guard clause: reject anything that is not a Sonnet identifier up front.
    if "sonnet" not in model_id:
        raise ValueError(f"Model {model_id} not found")

    return AnthropicModel(model_id=model_id), AgentType.PIXEL_COORDINATES
|
cua2-core/src/cua2-core/services/models/qwen.py
ADDED
|
File without changes
|
cua2-core/src/cua2-core/websocket/websocket_manager.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import json
|
| 3 |
+
from typing import Dict, Optional, Set
|
| 4 |
+
|
| 5 |
+
from fastapi import WebSocket
|
| 6 |
+
|
| 7 |
+
from backend.models.models import AgentMetadata, WebSocketEvent
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class WebSocketManager:
    """Manages WebSocket connections and broadcasts events to them.

    The manager keeps the set of accepted sockets, drops a socket when a
    send to it fails, and offers one helper per event type that builds a
    ``WebSocketEvent`` and broadcasts it to every connected client.
    """

    def __init__(self):
        # Sockets that have been accepted and not yet disconnected.
        self.active_connections: Set[WebSocket] = set()
        # Per-connection tasks; cancelled and removed in `disconnect`.
        self.connection_tasks: Dict[WebSocket, asyncio.Task] = {}

    @staticmethod
    def _serialize(message: WebSocketEvent) -> Optional[str]:
        """Return the JSON text for ``message``, or ``None`` if encoding fails.

        Serialization is kept separate from sending so that an event that
        cannot be encoded is reported as such, instead of being mistaken
        for a broken socket and causing clients to be disconnected.
        """
        try:
            return json.dumps(message.model_dump())
        except (TypeError, ValueError) as e:
            print(f"Error serializing WebSocket message: {e}")
            return None

    async def connect(self, websocket: WebSocket):
        """Accept a new WebSocket connection and start tracking it."""
        await websocket.accept()
        self.active_connections.add(websocket)
        print(f"WebSocket connected. Total connections: {len(self.active_connections)}")

    def disconnect(self, websocket: WebSocket):
        """Stop tracking a WebSocket connection and cancel its task, if any."""
        self.active_connections.discard(websocket)
        if websocket in self.connection_tasks:
            self.connection_tasks[websocket].cancel()
            del self.connection_tasks[websocket]
        print(
            f"WebSocket disconnected. Total connections: {len(self.active_connections)}"
        )

    async def send_personal_message(
        self, message: WebSocketEvent, websocket: WebSocket
    ):
        """Send a message to a specific WebSocket connection.

        A message that cannot be serialized is logged and skipped. A send
        failure is logged and the socket is disconnected. No exception is
        propagated to the caller in either case.
        """
        payload = self._serialize(message)
        if payload is None:
            return

        try:
            await websocket.send_text(payload)
        except Exception as e:  # any send failure means the socket is unusable
            print(f"Error sending personal message: {e}")
            # Only disconnect if the connection is still in our set
            if websocket in self.active_connections:
                self.disconnect(websocket)

    async def broadcast(self, message: WebSocketEvent):
        """Broadcast a message to all connected WebSockets.

        The message is serialized once and the same payload is sent to each
        connection. Connections whose send fails are disconnected after the
        loop; an unserializable message is logged and nothing is sent.
        """
        if not self.active_connections:
            return

        payload = self._serialize(message)
        if payload is None:
            return

        # Connections whose send failed; removed once iteration is finished.
        failed = []

        # Iterate over a copy so the set can change while we await sends.
        for connection in self.active_connections.copy():
            try:
                await connection.send_text(payload)
            except Exception as e:  # any send failure means the socket is unusable
                print(f"Error broadcasting to connection: {e}")
                failed.append(connection)

        for connection in failed:
            if connection in self.active_connections:
                self.disconnect(connection)

    async def send_agent_start(self, content: str, message_id: str):
        """Broadcast an ``agent_start`` event."""
        event = WebSocketEvent(
            type="agent_start", content=content, messageId=message_id
        )
        await self.broadcast(event)

    async def send_agent_progress(self, content: str, message_id: str):
        """Broadcast an ``agent_progress`` event."""
        event = WebSocketEvent(
            type="agent_progress", content=content, messageId=message_id
        )
        await self.broadcast(event)

    async def send_agent_complete(
        self, content: str, message_id: str, metadata: Optional[AgentMetadata] = None
    ):
        """Broadcast an ``agent_complete`` event with optional metadata."""
        event = WebSocketEvent(
            type="agent_complete",
            content=content,
            messageId=message_id,
            metadata=metadata,
        )
        await self.broadcast(event)

    async def send_agent_error(self, content: str, message_id: Optional[str] = None):
        """Broadcast an ``agent_error`` event; ``message_id`` may be omitted."""
        event = WebSocketEvent(
            type="agent_error", content=content, messageId=message_id
        )
        await self.broadcast(event)

    async def send_vnc_url_set(self, vnc_url: str, content: Optional[str] = None):
        """Broadcast a ``vnc_url_set`` event.

        When ``content`` is empty or omitted, a default text naming the URL
        is used.
        """
        event = WebSocketEvent(
            type="vnc_url_set",
            content=content or f"VNC stream available at: {vnc_url}",
            vncUrl=vnc_url,
        )
        await self.broadcast(event)

    async def send_vnc_url_unset(self, content: Optional[str] = None):
        """Broadcast a ``vnc_url_unset`` event (reset to default display)."""
        event = WebSocketEvent(
            type="vnc_url_unset",
            content=content or "VNC stream disconnected, showing default display",
        )
        await self.broadcast(event)

    def get_connection_count(self) -> int:
        """Return the number of active connections."""
        return len(self.active_connections)
|
cua2-front/.gitignore
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Logs
|
| 2 |
+
logs
|
| 3 |
+
*.log
|
| 4 |
+
npm-debug.log*
|
| 5 |
+
yarn-debug.log*
|
| 6 |
+
yarn-error.log*
|
| 7 |
+
pnpm-debug.log*
|
| 8 |
+
lerna-debug.log*
|
| 9 |
+
|
| 10 |
+
node_modules
|
| 11 |
+
dist
|
| 12 |
+
dist-ssr
|
| 13 |
+
*.local
|
| 14 |
+
|
| 15 |
+
# Editor directories and files
|
| 16 |
+
.vscode/*
|
| 17 |
+
!.vscode/extensions.json
|
| 18 |
+
.idea
|
| 19 |
+
.DS_Store
|
| 20 |
+
*.suo
|
| 21 |
+
*.ntvs*
|
| 22 |
+
*.njsproj
|
| 23 |
+
*.sln
|
| 24 |
+
*.sw?
|
cua2-front/index.html
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!doctype html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8" />
|
| 5 |
+
<link rel="icon" type="image/x-icon" href="/favicon.ico" />
|
| 6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 7 |
+
<title>CUA2</title>
|
| 8 |
+
</head>
|
| 9 |
+
<body>
|
| 10 |
+
<div id="root"></div>
|
| 11 |
+
<script type="module" src="/src/main.tsx"></script>
|
| 12 |
+
</body>
|
| 13 |
+
</html>
|
| 14 |
+
|
cua2-front/package-lock.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
cua2-front/package.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "cua2-front",
|
| 3 |
+
"private": true,
|
| 4 |
+
"version": "0.0.0",
|
| 5 |
+
"type": "module",
|
| 6 |
+
"scripts": {
|
| 7 |
+
"dev": "vite",
|
| 8 |
+
"build": "vite build",
|
| 9 |
+
"build:dev": "vite build --mode development",
|
| 10 |
+
"lint": "eslint src/ --config src/eslint.config.js",
|
| 11 |
+
"type-check": "tsc --noEmit --project tsconfig.app.json",
|
| 12 |
+
"preview": "vite preview"
|
| 13 |
+
},
|
| 14 |
+
"dependencies": {
|
| 15 |
+
"react": "^18.3.1",
|
| 16 |
+
"react-router-dom": "^6.30.1",
|
| 17 |
+
"react-dom": "^18.3.1"
|
| 18 |
+
},
|
| 19 |
+
"devDependencies": {
|
| 20 |
+
"@eslint/js": "^9.32.0",
|
| 21 |
+
"@types/node": "^22.16.5",
|
| 22 |
+
"@types/react": "^18.3.23",
|
| 23 |
+
"@types/react-dom": "^18.3.7",
|
| 24 |
+
"@vitejs/plugin-react-swc": "^3.11.0",
|
| 25 |
+
"autoprefixer": "^10.4.21",
|
| 26 |
+
"eslint": "^9.32.0",
|
| 27 |
+
"eslint-plugin-react-hooks": "^5.2.0",
|
| 28 |
+
"eslint-plugin-react-refresh": "^0.4.20",
|
| 29 |
+
"globals": "^15.15.0",
|
| 30 |
+
"typescript-eslint": "^8.38.0",
|
| 31 |
+
"vite": "^5.4.19"
|
| 32 |
+
}
|
| 33 |
+
}
|
cua2-front/src/App.tsx
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import React from 'react';
|
| 2 |
+
import { BrowserRouter, Routes, Route } from "react-router-dom";
|
| 3 |
+
import Index from "./pages/Index";
|
| 4 |
+
|
| 5 |
+
/**
 * Root component: wraps the route table in a `BrowserRouter` and renders
 * the `Index` page at "/". Only that single route is registered here.
 */
const App = () => (
  <BrowserRouter>
    <Routes>
      <Route path="/" element={<Index />} />
      {/* ADD ALL CUSTOM ROUTES ABOVE THE CATCH-ALL "*" ROUTE */}
    </Routes>
  </BrowserRouter>

);
|
| 14 |
+
|
| 15 |
+
export default App;
|
cua2-front/src/hooks/useWebSocket.ts
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { WebSocketEvent } from '@/types/agent';
|
| 2 |
+
import { useCallback, useEffect, useRef, useState } from 'react';
|
| 3 |
+
|
| 4 |
+
/** Options accepted by the `useWebSocket` hook. */
interface UseWebSocketProps {
  /** Endpoint passed to the `WebSocket` constructor. */
  url: string;
  /** Called with each incoming message after it has been parsed from JSON. */
  onMessage: (event: WebSocketEvent) => void;
  /**
   * Optional handler for socket errors. The hook throttles calls to it and
   * does not invoke it for errors that occur before the first successful
   * connection.
   */
  onError?: (error: Event) => void;
}
|
| 9 |
+
|
| 10 |
+
/**
 * React hook that manages one WebSocket connection to `url`.
 *
 * It connects on mount (and again whenever `url` changes), JSON-parses every
 * incoming message and forwards it to `onMessage`, and retries up to three
 * times with a capped, jittered delay when the socket closes with a code
 * other than 1000. The socket is closed when the component unmounts.
 *
 * @param url       Endpoint handed to the `WebSocket` constructor.
 * @param onMessage Receives each successfully parsed message. The latest
 *                  callback is always used, even if it changes between renders.
 * @param onError   Optional. Receives socket `error` events, at most once
 *                  every 5 s, and only after a first successful connection
 *                  or a manual reconnect.
 * @returns `isConnected`, `connectionState`, `sendMessage`, `reconnect`,
 *          `disconnect` and `manualReconnect`.
 */
export const useWebSocket = ({ url, onMessage, onError }: UseWebSocketProps) => {
  const [isConnected, setIsConnected] = useState(false);
  const [connectionState, setConnectionState] = useState<'connecting' | 'connected' | 'disconnected' | 'error'>('disconnected');
  const wsRef = useRef<WebSocket | null>(null);
  // ReturnType<typeof setTimeout> is valid with both browser and Node typings.
  const reconnectTimeoutRef = useRef<ReturnType<typeof setTimeout> | undefined>(undefined);
  const reconnectAttemptsRef = useRef(0);
  const maxReconnectAttempts = 3; // Only try three times, then stop
  const baseReconnectDelay = 3000; // Start with 3 seconds
  const maxReconnectDelay = 5000; // Max 5 seconds
  const lastErrorTimeRef = useRef(0);
  const errorThrottleMs = 5000; // Only report an error once every 5 seconds
  const isInitialConnectionRef = useRef(true); // True until the first successful connection

  // Keep the newest callbacks in refs so the socket handlers never call a
  // stale closure, without having to reconnect when the callbacks change.
  const onMessageRef = useRef(onMessage);
  const onErrorRef = useRef(onError);
  useEffect(() => {
    onMessageRef.current = onMessage;
    onErrorRef.current = onError;
  }, [onMessage, onError]);

  // Cancels a pending automatic or manual reconnect, if any.
  const clearReconnectTimer = useCallback(() => {
    if (reconnectTimeoutRef.current !== undefined) {
      clearTimeout(reconnectTimeoutRef.current);
      reconnectTimeoutRef.current = undefined;
    }
  }, []);

  const connect = useCallback(() => {
    const existing = wsRef.current;
    if (existing?.readyState === WebSocket.OPEN || existing?.readyState === WebSocket.CONNECTING) {
      return; // Already connected or connecting
    }

    // A retry that is still pending would otherwise open a second socket later.
    clearReconnectTimer();

    // Backoff: base * 2^attempt, capped at the maximum, plus up to 1 s of jitter.
    const getReconnectDelay = () => {
      const delay = Math.min(
        baseReconnectDelay * Math.pow(2, reconnectAttemptsRef.current),
        maxReconnectDelay
      );
      return delay + Math.random() * 1000;
    };

    try {
      setConnectionState('connecting');
      const ws = new WebSocket(url);
      wsRef.current = ws;

      // Events from a socket that was disconnected or replaced must be ignored;
      // otherwise a late close event would flip state or schedule a reconnect.
      const isCurrent = () => wsRef.current === ws;

      ws.onopen = () => {
        if (!isCurrent()) return;
        console.log('WebSocket connected');
        setIsConnected(true);
        setConnectionState('connected');
        reconnectAttemptsRef.current = 0; // Reset attempts on successful connection
        isInitialConnectionRef.current = false; // We have connected at least once
      };

      ws.onmessage = (event) => {
        if (!isCurrent()) return;

        let data: WebSocketEvent;
        try {
          data = JSON.parse(event.data) as WebSocketEvent;
        } catch (error) {
          console.error('Failed to parse WebSocket message:', error);
          return;
        }

        // Handler failures are reported separately so they are not mistaken
        // for malformed messages.
        try {
          onMessageRef.current(data);
        } catch (error) {
          console.error('WebSocket message handler threw:', error);
        }
      };

      ws.onerror = (error) => {
        if (!isCurrent()) return;
        console.error('WebSocket error:', error);
        setConnectionState('error');

        // Stay quiet while the very first connection attempt is failing;
        // only report errors once a connection has succeeded before.
        if (!isInitialConnectionRef.current) {
          const now = Date.now();
          if (now - lastErrorTimeRef.current > errorThrottleMs) {
            lastErrorTimeRef.current = now;
            onErrorRef.current?.(error);
          }
        }
      };

      ws.onclose = (event) => {
        if (!isCurrent()) return; // Closed by disconnect() or superseded by a newer socket
        console.log('WebSocket disconnected', { code: event.code, reason: event.reason });
        setIsConnected(false);
        setConnectionState('disconnected');

        if (event.code === 1000) {
          // Normal closure - don't reconnect
          console.log('WebSocket closed normally, not reconnecting');
          return;
        }

        if (reconnectAttemptsRef.current >= maxReconnectAttempts) {
          console.log('Max reconnection attempts reached');
          setConnectionState('error');
          return;
        }

        const delay = getReconnectDelay();
        console.log(`Attempting to reconnect in ${Math.round(delay)}ms (attempt ${reconnectAttemptsRef.current + 1}/${maxReconnectAttempts})`);

        reconnectTimeoutRef.current = setTimeout(() => {
          reconnectTimeoutRef.current = undefined;
          reconnectAttemptsRef.current++;
          connect();
        }, delay);
      };
    } catch (error) {
      console.error('Failed to create WebSocket connection:', error);
      setConnectionState('error');
    }
  }, [url, clearReconnectTimer]);

  const disconnect = useCallback(() => {
    clearReconnectTimer();

    // Drop the reference before closing: closing a socket that is still
    // connecting reports an abnormal close code, and the handlers must not
    // treat that as a reason to reconnect.
    const ws = wsRef.current;
    wsRef.current = null;
    if (ws) {
      ws.close(1000, 'Manual disconnect');
    }

    setIsConnected(false);
    setConnectionState('disconnected');
    reconnectAttemptsRef.current = 0;
  }, [clearReconnectTimer]);

  const manualReconnect = useCallback(() => {
    console.log('Manual reconnect requested');
    disconnect();
    reconnectAttemptsRef.current = 0;
    isInitialConnectionRef.current = false; // Allow error reporting on manual reconnect

    // Small delay before reconnecting; tracked so disconnect()/unmount cancels it.
    reconnectTimeoutRef.current = setTimeout(() => {
      reconnectTimeoutRef.current = undefined;
      connect();
    }, 1000);
  }, [disconnect, connect]);

  // Serializes `message` as JSON and sends it if the socket is open.
  const sendMessage = useCallback((message: unknown) => {
    if (wsRef.current?.readyState === WebSocket.OPEN) {
      try {
        wsRef.current.send(JSON.stringify(message));
      } catch (error) {
        console.error('Failed to send WebSocket message:', error);
      }
    } else {
      console.warn('WebSocket is not connected');
    }
  }, []);

  // `connect` only changes when `url` does, so this still runs once per URL.
  useEffect(() => {
    connect();

    return () => {
      disconnect();
    };
  }, [connect, disconnect]);

  return {
    isConnected,
    connectionState,
    sendMessage,
    reconnect: connect,
    disconnect,
    manualReconnect
  };
};
|
cua2-front/src/index.css
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Global reset: strip default spacing and size every box by its border. */
* {
  margin: 0;
  padding: 0;
  box-sizing: border-box;
}

/* Page defaults: system font stack with smoothed glyph rendering. */
body {
  margin: 0;
  font-family:
    -apple-system,
    BlinkMacSystemFont,
    'Segoe UI',
    'Roboto',
    'Oxygen',
    'Ubuntu',
    'Cantarell',
    'Fira Sans',
    'Droid Sans',
    'Helvetica Neue',
    sans-serif;
  -webkit-font-smoothing: antialiased;
  -moz-osx-font-smoothing: grayscale;
}

/* Mount node: full width, one viewport tall. */
#root {
  width: 100%;
  height: 100vh;
}
|
| 20 |
+
|
cua2-front/src/main.tsx
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { createRoot } from "react-dom/client";
|
| 2 |
+
import App from "./App.tsx";
|
| 3 |
+
import "./index.css";
|
| 4 |
+
|
| 5 |
+
// Mount the application into the #root element declared in index.html.
const rootElement = document.getElementById("root")!;
const root = createRoot(rootElement);
root.render(<App />);
|
cua2-front/src/pages/Index.tsx
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import React from 'react';
|
| 2 |
+
import { useWebSocket } from '@/hooks/useWebSocket';
|
| 3 |
+
import { AgentMessage, WebSocketEvent } from '@/types/agent';
|
| 4 |
+
import { useEffect, useState } from 'react';
|
| 5 |
+
|
| 6 |
+
/**
 * Landing page. Opens the agent WebSocket, folds incoming agent events into
 * local state (messages, processing flag, VNC URL) and exposes a helper that
 * sends a user task to the backend.
 *
 * The rendered markup is still a placeholder; the state and the values
 * returned by `useWebSocket` are not displayed yet.
 */
const Index = () => {
  const [messages, setMessages] = useState<AgentMessage[]>([]);
  const [isAgentProcessing, setIsAgentProcessing] = useState(false);
  const [vncUrl, setVncUrl] = useState<string>('');

  // WebSocket endpoint. Vite only exposes env variables prefixed with VITE_,
  // so VITE_WS_URL overrides the local default.
  const WS_URL = import.meta.env.VITE_WS_URL || 'ws://localhost:8000/ws';

  // Applies one backend event to local state.
  const handleWebSocketMessage = (event: WebSocketEvent) => {
    console.log('WebSocket event received:', event);

    switch (event.type) {
      case 'agent_start':
        setIsAgentProcessing(true);
        if (event.content) {
          const newMessage: AgentMessage = {
            id: event.messageId,
            type: 'agent',
            instructions: event.instructions,
            modelId: event.modelId,
            timestamp: new Date(),
            isLoading: true,
          };
          setMessages(prev => [...prev, newMessage]);
        }
        break;

      case 'agent_progress': {
        // Append a new step from an agent trace run (image, generated text,
        // actions, tokens, timestamp). The owning message is identified by the
        // id carried inside the step, so only the step itself is required.
        const step = event.agentStep;
        if (step) {
          setMessages(prev =>
            prev.map(msg => {
              if (msg.id !== step.messageId) {
                return msg;
              }
              const existingSteps = msg.steps || [];
              // Ignore a step that was already recorded.
              if (existingSteps.some(existing => existing.stepId === step.stepId)) {
                return msg;
              }
              return { ...msg, steps: [...existingSteps, step], isLoading: true };
            })
          );
        }
        break;
      }

      case 'agent_complete': {
        setIsAgentProcessing(false);
        // The owning message is identified by the id carried inside the metadata.
        const metadata = event.metadata;
        if (metadata) {
          setMessages(prev =>
            prev.map(msg =>
              msg.id === metadata.messageId
                ? {
                    ...msg,
                    isLoading: false,
                    metadata,
                  }
                : msg
            )
          );
        }
        break;
      }

      case 'agent_error':
        setIsAgentProcessing(false);
        // TODO: Handle agent error
        break;

      case 'vnc_url_set':
        if (event.vncUrl) {
          setVncUrl(event.vncUrl);
        }
        // TODO: Handle VNC URL set
        break;

      case 'vnc_url_unset':
        setVncUrl('');
        // TODO: Handle VNC URL unset
        break;

      case 'heartbeat':
        console.log('Heartbeat received:', event);
        break;
    }
  };

  const handleWebSocketError = () => {
    // Intentionally empty: error reporting is throttled inside the WebSocket hook.
  };

  const { isConnected, connectionState, sendMessage, manualReconnect } = useWebSocket({
    url: WS_URL,
    onMessage: handleWebSocketMessage,
    onError: handleWebSocketError,
  });

  // Records the user's message locally and forwards it to the backend as a task.
  const handleSendMessage = (content: string) => {
    const userMessage: AgentMessage = {
      id: Date.now().toString(),
      type: 'user',
      content,
      timestamp: new Date(),
    };

    setMessages(prev => [...prev, userMessage]);

    // Send message to Python backend via WebSocket
    sendMessage({
      type: 'user_task',
      content,
      model_id: "anthropic/claude-sonnet-4-5-20250929",
      timestamp: new Date().toISOString(),
    });
  };

  return (
    <div>
      <h1>Hello World</h1>
    </div>
  );
};
|
| 131 |
+
|
| 132 |
+
export default Index;
|
cua2-front/src/types/agent.ts
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
 * One entry in the conversation: either a task typed by the user or an agent
 * run that accumulates steps while it executes.
 */
export interface AgentMessage {
  /** Identifier used to match incoming steps and metadata to this message. */
  id: string;
  /** Who produced the message. */
  type: 'user' | 'agent';
  /** When the message was created on the client. */
  timestamp: Date;
  /** Text typed by the user; set on `user` messages. */
  content?: string;
  /** Instructions of the agent run; user messages are created without it. */
  instructions?: string;
  /** Model identifier of the agent run; user messages are created without it. */
  modelId?: string;
  /** Steps received so far for this run. */
  steps?: AgentStep[];
  /** Summary attached when the run completes. */
  metadata?: AgentMetadata;
  /** True while the run is still in progress. */
  isLoading?: boolean;
}
|
| 11 |
+
|
| 12 |
+
/** A single step of an agent run, as delivered in an `agent_progress` event. */
export interface AgentStep {
  /** Id of the `AgentMessage` this step belongs to. */
  messageId: string;
  /** Id of the step; used to skip steps that were already recorded. */
  stepId: string;
  /** Image captured for the step (encoding is not defined here; confirm with the backend). */
  image: string;
  /** Text generated by the model for this step. */
  generatedText: string;
  /** Actions taken in this step. */
  actions: string[];
  /** Input tokens consumed by this step. */
  inputTokensUsed: number;
  /** Output tokens produced by this step. */
  outputTokensUsed: number;
  /**
   * Time of the step.
   * NOTE(review): events are produced by `JSON.parse`, so at runtime this is
   * whatever the payload carries (probably a string) - confirm before calling
   * `Date` methods on it.
   */
  timestamp: Date;
}
|
| 22 |
+
|
| 23 |
+
/** Summary of a finished agent run, as delivered in an `agent_complete` event. */
export interface AgentMetadata {
  /** Id of the `AgentMessage` this summary belongs to. */
  messageId: string;
  /** Input tokens consumed by the run. */
  inputTokensUsed: number;
  /** Output tokens produced by the run. */
  outputTokensUsed: number;
  /** Duration of the run (unit is not defined here; confirm with the backend). */
  timeTaken: number;
  /** Number of steps in the run. */
  numberOfSteps: number;
}
|
| 30 |
+
|
| 31 |
+
/**
 * Message received from the backend over the WebSocket. `type` selects the
 * kind of event; the remaining fields are optional and only some of them are
 * read for any given type.
 */
export interface WebSocketEvent {
  /** Kind of event. */
  type: 'agent_start' | 'agent_progress' | 'agent_complete' | 'agent_error' | 'vnc_url_set' | 'vnc_url_unset' | 'heartbeat';
  /** Id of the message the event refers to; read when an agent run starts. */
  messageId?: string;
  /** Text payload; its presence is checked when an agent run starts. */
  content?: string;
  /** Instructions of the starting agent run. */
  instructions?: string;
  /** Model identifier of the starting agent run. */
  modelId?: string;
  /** Step payload of an `agent_progress` event. */
  agentStep?: AgentStep;
  /** Summary payload of an `agent_complete` event. */
  metadata?: AgentMetadata;
  /** URL payload of a `vnc_url_set` event. */
  vncUrl?: string;
}
|
cua2-front/src/vite-env.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
/// <reference types="vite/client" />
|
cua2-front/tsconfig.app.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"compilerOptions": {
|
| 3 |
+
"target": "ES2020",
|
| 4 |
+
"useDefineForClassFields": true,
|
| 5 |
+
"lib": [
|
| 6 |
+
"ES2020",
|
| 7 |
+
"DOM",
|
| 8 |
+
"DOM.Iterable"
|
| 9 |
+
],
|
| 10 |
+
"module": "ESNext",
|
| 11 |
+
"skipLibCheck": true,
|
| 12 |
+
/* Bundler mode */
|
| 13 |
+
"moduleResolution": "bundler",
|
| 14 |
+
"allowImportingTsExtensions": true,
|
| 15 |
+
"isolatedModules": true,
|
| 16 |
+
"moduleDetection": "force",
|
| 17 |
+
"noEmit": true,
|
| 18 |
+
"jsx": "react-jsx",
|
| 19 |
+
/* Linting */
|
| 20 |
+
"strict": false,
|
| 21 |
+
"noUnusedLocals": false,
|
| 22 |
+
"noUnusedParameters": false,
|
| 23 |
+
"noImplicitAny": false,
|
| 24 |
+
"noFallthroughCasesInSwitch": false,
|
| 25 |
+
"baseUrl": ".",
|
| 26 |
+
"paths": {
|
| 27 |
+
"@/*": [
|
| 28 |
+
"./src/*"
|
| 29 |
+
]
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"include": [
|
| 33 |
+
"src",
|
| 34 |
+
]
|
| 35 |
+
}
|
cua2-front/tsconfig.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"files": [],
|
| 3 |
+
"references": [{ "path": "./tsconfig.app.json" }, { "path": "./tsconfig.node.json" }],
|
| 4 |
+
"compilerOptions": {
|
| 5 |
+
"baseUrl": ".",
|
| 6 |
+
"paths": {
|
| 7 |
+
"@/*": ["./src/*"]
|
| 8 |
+
},
|
| 9 |
+
"noImplicitAny": false,
|
| 10 |
+
"noUnusedParameters": false,
|
| 11 |
+
"skipLibCheck": true,
|
| 12 |
+
"allowJs": true,
|
| 13 |
+
"noUnusedLocals": false,
|
| 14 |
+
"strictNullChecks": false
|
| 15 |
+
}
|
| 16 |
+
}
|
cua2-front/tsconfig.node.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"compilerOptions": {
|
| 3 |
+
"target": "ES2022",
|
| 4 |
+
"lib": ["ES2023"],
|
| 5 |
+
"module": "ESNext",
|
| 6 |
+
"skipLibCheck": true,
|
| 7 |
+
|
| 8 |
+
/* Bundler mode */
|
| 9 |
+
"moduleResolution": "bundler",
|
| 10 |
+
"allowImportingTsExtensions": true,
|
| 11 |
+
"isolatedModules": true,
|
| 12 |
+
"moduleDetection": "force",
|
| 13 |
+
"noEmit": true,
|
| 14 |
+
|
| 15 |
+
/* Linting */
|
| 16 |
+
"strict": true,
|
| 17 |
+
"noUnusedLocals": false,
|
| 18 |
+
"noUnusedParameters": false,
|
| 19 |
+
"noFallthroughCasesInSwitch": true
|
| 20 |
+
},
|
| 21 |
+
"include": ["vite.config.ts"]
|
| 22 |
+
}
|
cua2-front/vite.config.ts
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { defineConfig } from "vite";
|
| 2 |
+
import react from "@vitejs/plugin-react-swc";
|
| 3 |
+
import path from "path";
|
| 4 |
+
|
| 5 |
+
// https://vitejs.dev/config/
|
| 6 |
+
export default defineConfig(() => {
  // Absolute path behind the "@" import alias.
  const sourceRoot = path.resolve(__dirname, "./src");

  return {
    // Dev server: listen on "::" at port 8080.
    server: {
      host: "::",
      port: 8080,
    },
    plugins: [react()],
    resolve: {
      alias: {
        "@": sourceRoot,
      },
    },
  };
});
|