Spaces:

smolagents
/

computer-use-agent

Running

App Files Files Community

Amir Mahla committed on Oct 16

Commit

af1ae43

0 Parent(s):

Init CUA2

Browse files

Files changed (36) hide show

.gitignore +226 -0
cua2-core/env.example +11 -0
cua2-core/pyproject.toml +93 -0
cua2-core/src/__init__.py +1 -0
cua2-core/src/cua2-core/__init__.py +1 -0
cua2-core/src/cua2-core/app.py +64 -0
cua2-core/src/cua2-core/main.py +37 -0
cua2-core/src/cua2-core/models/models.py +95 -0
cua2-core/src/cua2-core/routes/routes.py +56 -0
cua2-core/src/cua2-core/routes/websocket.py +86 -0
cua2-core/src/cua2-core/services/agent_service.py +130 -0
cua2-core/src/cua2-core/services/agents/get_agents.py +57 -0
cua2-core/src/cua2-core/services/agents/normalized_1000_agent.py +293 -0
cua2-core/src/cua2-core/services/agents/normalized_agent.py +282 -0
cua2-core/src/cua2-core/services/agents/pixel_coordonates_agent.py +317 -0
cua2-core/src/cua2-core/services/agents/prompt.py +548 -0
cua2-core/src/cua2-core/services/models/anthropic.py +10 -0
cua2-core/src/cua2-core/services/models/gemini.py +0 -0
cua2-core/src/cua2-core/services/models/get_model.py +12 -0
cua2-core/src/cua2-core/services/models/qwen.py +0 -0
cua2-core/src/cua2-core/websocket/websocket_manager.py +117 -0
cua2-front/.gitignore +24 -0
cua2-front/index.html +14 -0
cua2-front/package-lock.json +0 -0
cua2-front/package.json +33 -0
cua2-front/src/App.tsx +15 -0
cua2-front/src/hooks/useWebSocket.ts +154 -0
cua2-front/src/index.css +20 -0
cua2-front/src/main.tsx +5 -0
cua2-front/src/pages/Index.tsx +132 -0
cua2-front/src/types/agent.ts +36 -0
cua2-front/src/vite-env.d.ts +1 -0
cua2-front/tsconfig.app.json +35 -0
cua2-front/tsconfig.json +16 -0
cua2-front/tsconfig.node.json +22 -0
cua2-front/vite.config.ts +17 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,226 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# Abstra
+# Abstra is an AI-powered process automation framework.
+# Ignore directories containing user credentials, local state, and settings.
+# Learn more at https://abstra.io/docs
+.abstra/
+# Visual Studio Code
+#  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+#  that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#  and can be added to the global gitignore or merged into this file. However, if you prefer,
+#  you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+# Ruff stuff:
+.ruff_cache/
+# PyPI configuration file
+.pypirc
+# Cursor
+#  Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+#  exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+#  refer to https://docs.cursor.com/context/ignore-files
+.cursorignore
+.cursorindexingignore
+gui_agent_demo.mp4
+recording.mp4
+uv.lock
+.DS_Store
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+lerna-debug.log*
+node_modules
+dist
+dist-ssr
+*.local
+# Editor directories and files
+.vscode/*
+!.vscode/extensions.json
+.idea
+.DS_Store
+*.suo
+*.ntvs*
+*.njsproj
+*.sln
+*.sw?

cua2-core/env.example ADDED Viewed

	@@ -0,0 +1,11 @@

+# Environment Configuration
+HOST=0.0.0.0
+PORT=8000
+DEBUG=true
+# Agent Configuration
+AGENT_TIMEOUT=300
+MAX_CONCURRENT_TASKS=5
+# Logging
+LOG_LEVEL=INFO

cua2-core/pyproject.toml ADDED Viewed

	@@ -0,0 +1,93 @@

+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[project]
+name = "cua2-core"
+version = "0.0.0-dev.0"
+description = "Backend API server for Computer Use Agent"
+readme = "README.md"
+authors = [{ name = "Amir Mahla", email = "[email protected]" }]
+keywords = ["fastapi", "api", "backend", "automation"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Internet :: WWW/HTTP :: HTTP Servers",
+    "Topic :: Software Development :: Libraries :: Application Frameworks",
+]
+requires-python = ">=3.10"
+dependencies = [
+    "fastapi>=0.115.13",
+    "uvicorn[standard]>=0.29.0,<0.30.0",
+    "websockets>=13.1.0,<14.0.0",
+    "pydantic>=2.11.7",
+    "python-multipart>=0.0.18,<0.0.19",
+    "python-jose[cryptography]==3.3.0",
+    "passlib[bcrypt]==1.7.4",
+    "python-dotenv==1.0.0",
+    "httpx>=0.27.1",
+    "asyncio-mqtt==0.16.1",
+    "aiofiles==23.2.1",
+    "smolagents[openai,litellm]==1.15.0",
+    "openai==1.91.0",
+    "litellm[proxy]==1.63.14",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=7.0.0",
+    "pytest-asyncio>=0.21.0",
+    "pytest-cov>=4.0.0",
+    "black>=23.0.0",
+    "isort>=5.12.0",
+    "flake8>=6.0.0",
+    "mypy>=1.0.0",
+    "pre-commit>=3.0.0",
+]
+test = [
+    "pytest>=7.0.0",
+    "pytest-asyncio>=0.21.0",
+    "pytest-cov>=4.0.0",
+]
+[project.urls]
+Homepage = "https://github.com/huggingface/CUA2"
+Repository = "https://github.com/huggingface/CUA2"
+[tool.hatch.build.targets.wheel]
+packages = ["src/cua2-core"]
+[tool.hatch.build.targets.sdist]
+include = [
+    "/src",
+    "/README.md",
+]
+[tool.coverage.run]
+source = ["src"]
+omit = [
+    "*/tests/*",
+    "*/test_*",
+    "*/__pycache__/*",
+    "*/migrations/*",
+]
+[tool.coverage.report]
+exclude_lines = [
+    "pragma: no cover",
+    "def __repr__",
+    "if self.debug:",
+    "if settings.DEBUG",
+    "raise AssertionError",
+    "raise NotImplementedError",
+    "if 0:",
+    "if __name__ == .__main__.:",
+    "class .*\\bProtocol\\):",
+    "@(abc\\.)?abstractmethod",
+]

cua2-core/src/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Backend package

cua2-core/src/cua2-core/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Backend package

cua2-core/src/cua2-core/app.py ADDED Viewed

	@@ -0,0 +1,64 @@

+from contextlib import asynccontextmanager
+from dotenv import load_dotenv
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from backend.services.agent_service import AgentService
+from backend.websocket.websocket_manager import WebSocketManager
+# Load environment variables
+load_dotenv()
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Lifespan context manager for startup and shutdown events"""
+    # Startup: Initialize services
+    print("Initializing services...")
+    # Initialize WebSocket manager
+    websocket_manager = WebSocketManager()
+    # Initialize agent service with websocket manager dependency
+    agent_service = AgentService(websocket_manager)
+    # Store services in app state for access in routes
+    app.state.websocket_manager = websocket_manager
+    app.state.agent_service = agent_service
+    print("Services initialized successfully")
+    yield
+    # Shutdown: Clean up resources
+    print("Shutting down services...")
+    # Add any cleanup logic here if needed
+    print("Services shut down successfully")
+# Create FastAPI app with lifespan
+# The lifespan handler defined above is passed in; interactive docs are served at /docs and /redoc.
+app = FastAPI(
+    title="Computer Use Studio Backend",
+    description="Backend API for Computer Use Studio - AI-powered automation interface",
+    version="1.0.0",
+    docs_url="/docs",
+    redoc_url="/redoc",
+    lifespan=lifespan,
+)
+# Configure CORS
+# Only the local development origins listed below are allowed; any method and any header
+# is accepted from them, and credentials are permitted.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=[
+        "http://localhost:3000",  # React dev server
+        "http://localhost:5173",  # Vite dev server
+        "http://localhost:8080",  # Alternative frontend port
+        "http://127.0.0.1:3000",
+        "http://127.0.0.1:5173",
+        "http://127.0.0.1:8080",
+    ],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)

cua2-core/src/cua2-core/main.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import os
+import uvicorn
+from backend.app import app
+from backend.routes.routes import router
+from backend.routes.websocket import router as websocket_router
+# Include routes
+# REST endpoints are mounted under /api/v1; the WebSocket router is mounted without a prefix.
+app.include_router(router, prefix="/api/v1")
+app.include_router(websocket_router)
+# Health check endpoint (without prefix)
+@app.get("/health")
+async def health():
+    """Return a static payload reporting the service as healthy"""
+    return {"status": "healthy", "service": "computer-use-studio-backend"}
+if __name__ == "__main__":
+    # Get configuration from environment variables
+    host = os.getenv("HOST", "0.0.0.0")
+    port = int(os.getenv("PORT", 8000))
+    # Only the string "true" (compared case-insensitively) turns debug mode on
+    debug = os.getenv("DEBUG", "false").lower() == "true"
+    print(f"Starting Computer Use Studio Backend on {host}:{port}")
+    print(f"Debug mode: {debug}")
+    print(f"API Documentation: http://{host}:{port}/docs")
+    print(f"WebSocket endpoint: ws://{host}:{port}/ws")
+    # NOTE(review): the import string targets backend.app, while the routers and /health are
+    # attached in this module. If uvicorn re-imports the target in a fresh process when
+    # reload is on, that app would come up without these routes - confirm.
+    uvicorn.run(
+        "backend.app:app",
+        host=host,
+        port=port,
+        reload=debug,
+        log_level="info" if not debug else "debug",
+    )

cua2-core/src/cua2-core/models/models.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import json
+import os
+from datetime import datetime
+from enum import Enum
+from typing import Literal, Optional
+from pydantic import BaseModel, model_validator
+class AgentMetadata(BaseModel):
+    """Metadata for agent execution; all four fields are required.
+    NOTE(review): field names are camelCase, presumably to match the frontend JSON payload - confirm.
+    """
+    inputTokensUsed: int
+    outputTokensUsed: int
+    timeTaken: float  # in seconds
+    numberOfSteps: int
+class AgentType(str, Enum):
+    """Agent type; members are also plain strings because the enum subclasses ``str``"""
+    PIXEL_COORDINATES = "pixel_coordinates"
+    NORMALIZED_1000_COORDINATES = "normalized_1000_coordinates"
+    NORMALIZED_COORDINATES = "normalized_coordinates"
+class ActiveTask(BaseModel):
+    """A task tracked by the backend.
+    Building an instance has a filesystem side effect: the ``validate_model_id`` hook creates
+    the trace directory and writes the task into it.
+    """
+    message_id: str
+    content: str
+    model_id: str
+    start_time: datetime
+    status: str
+    @property
+    def trace_path(self):
+        """Relative directory for this task's files: ``data/trace-<message_id>-<model_id>``"""
+        return f"data/trace-{self.message_id}-{self.model_id}"
+    @model_validator(mode="after")
+    def validate_model_id(self):
+        """Create the trace directory and dump this task to ``user_tasks.json`` inside it.
+        NOTE(review): despite the name, nothing is validated here; the hook is used for its
+        side effect and returns the instance unchanged. A ``model_id`` containing a path
+        separator would produce nested directories - confirm that is intended.
+        """
+        os.makedirs(self.trace_path, exist_ok=True)
+        # Opened in "w" mode, so an existing user_tasks.json for the same path is overwritten
+        with open(f"{self.trace_path}/user_tasks.json", "w") as f:
+            json.dump(self.model_dump(mode="json"), f, indent=2)
+        return self
+class WebSocketEvent(BaseModel):
+    """WebSocket event structure.
+    ``type`` must be one of the literals listed below; every other field is optional and
+    defaults to None.
+    """
+    type: Literal[
+        "agent_start",
+        "agent_progress",
+        "agent_complete",
+        "agent_error",
+        "vnc_url_set",
+        "vnc_url_unset",
+        "heartbeat",
+    ]
+    content: Optional[str] = None
+    metadata: Optional[AgentMetadata] = None
+    messageId: Optional[str] = None
+    vncUrl: Optional[str] = None
+class UserTaskMessage(BaseModel):
+    """Message sent from frontend to backend.
+    ``type`` only accepts "user_task"; ``timestamp`` is kept as a plain string, not a datetime.
+    """
+    type: Literal["user_task"]
+    content: str
+    model_id: str
+    timestamp: str
+class AgentMessage(BaseModel):
+    """Agent message structure.
+    ``type`` is either "user" or "agent"; ``metadata``, ``isLoading`` and ``truncated`` are
+    optional and default to None.
+    """
+    id: str
+    type: Literal["user", "agent"]
+    content: str
+    timestamp: datetime
+    metadata: Optional[AgentMetadata] = None
+    isLoading: Optional[bool] = None
+    truncated: Optional[bool] = None
+class HealthResponse(BaseModel):
+    """Health check response; all three fields are required"""
+    status: str
+    timestamp: datetime
+    websocket_connections: int

cua2-core/src/cua2-core/routes/routes.py ADDED Viewed

	@@ -0,0 +1,56 @@

+from datetime import datetime
+from fastapi import APIRouter, Depends, HTTPException, Request
+# Get services from app state
+from backend.models.models import HealthResponse
+from backend.services.agent_service import AgentService
+from backend.websocket.websocket_manager import WebSocketManager
+# Create router
+router = APIRouter()
+def get_websocket_manager(request: Request) -> WebSocketManager:
+    """Dependency to get WebSocket manager from app state"""
+    return request.app.state.websocket_manager
+def get_agent_service(request: Request) -> AgentService:
+    """Dependency to get agent service from app state"""
+    return request.app.state.agent_service
+@router.get("/health", response_model=HealthResponse)
+async def health_check(
+    websocket_manager: WebSocketManager = Depends(get_websocket_manager),
+):
+    """Health check endpoint"""
+    return HealthResponse(
+        status="healthy",
+        timestamp=datetime.now(),
+        websocket_connections=websocket_manager.get_connection_count(),
+    )
+@router.get("/tasks")
+async def get_active_tasks(
+    agent_service: AgentService = Depends(get_agent_service),
+    websocket_manager: WebSocketManager = Depends(get_websocket_manager),
+):
+    """Get currently active tasks"""
+    return {
+        "active_tasks": agent_service.get_active_tasks(),
+        "total_connections": websocket_manager.get_connection_count(),
+    }
+@router.get("/tasks/{task_id}")
+async def get_task_status(
+    task_id: str, agent_service: AgentService = Depends(get_agent_service)
+):
+    """Get status of a specific task"""
+    task_status = agent_service.get_task_status(task_id)
+    if task_status is None:
+        raise HTTPException(status_code=404, detail="Task not found")
+    return {"task_id": task_id, "status": task_status}

cua2-core/src/cua2-core/routes/websocket.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import json
+from fastapi import APIRouter, WebSocket, WebSocketDisconnect
+# Get services from app state
+from backend.app import app
+from backend.models.models import UserTaskMessage, WebSocketEvent
+# Create router
+router = APIRouter()
+@router.websocket("/ws")
+async def websocket_endpoint(websocket: WebSocket):
+    """WebSocket endpoint for real-time communication"""
+    websocket_manager = app.state.websocket_manager
+    agent_service = app.state.agent_service
+    await websocket_manager.connect(websocket)
+    try:
+        welcome_message = WebSocketEvent(
+            type="heartbeat",
+            content="WebSocket connection established successfully",
+            messageId="connection_welcome",
+        )
+        await websocket_manager.send_personal_message(welcome_message, websocket)
+        # Keep the connection alive and wait for messages
+        while True:
+            try:
+                # Wait for messages from client
+                data = await websocket.receive_text()
+                try:
+                    # Parse the message
+                    message_data = json.loads(data)
+                    message = UserTaskMessage(**message_data)
+                    # Process the user task
+                    if message.type == "user_task":
+                        message_id = await agent_service.process_user_task(
+                            message.content, message.model_id
+                        )
+                        # Send acknowledgment back to the client
+                        response = WebSocketEvent(
+                            type="agent_start",
+                            content=f"Received task: {message.content}",
+                            messageId=message_id,
+                        )
+                        await websocket_manager.send_personal_message(
+                            response, websocket
+                        )
+                except json.JSONDecodeError:
+                    error_response = WebSocketEvent(
+                        type="agent_error", content="Invalid JSON format"
+                    )
+                    await websocket_manager.send_personal_message(
+                        error_response, websocket
+                    )
+                except Exception as e:
+                    print(f"Error processing message: {e}")
+                    error_response = WebSocketEvent(
+                        type="agent_error",
+                        content=f"Error processing message: {str(e)}",
+                    )
+                    await websocket_manager.send_personal_message(
+                        error_response, websocket
+                    )
+            except Exception as e:
+                print(f"Error receiving WebSocket message: {e}")
+                # If we can't receive messages, the connection is likely broken
+                break
+    except WebSocketDisconnect:
+        print("WebSocket disconnected normally")
+    except Exception as e:
+        print(f"WebSocket connection error: {e}")
+    finally:
+        # Ensure cleanup happens
+        websocket_manager.disconnect(websocket)

cua2-core/src/cua2-core/services/agent_service.py ADDED Viewed

	@@ -0,0 +1,130 @@

+import asyncio
+import uuid
+from datetime import datetime
+from typing import Optional
+from smolagents import Model
+from backend.models.models import ActiveTask, AgentMetadata
+from backend.services.agents.get_agents import get_agent
+from backend.services.models.get_model import get_model
+from backend.websocket.websocket_manager import WebSocketManager
+from computer_use_studio import Sandbox
+from computer_use_studio.logger import get_logger
+logger = get_logger(__name__)
+class AgentService:
+    """Service for handling agent tasks and processing"""
+    def __init__(self, websocket_manager):
+        self.active_tasks: dict[str, ActiveTask] = {}
+        self.websocket_manager: WebSocketManager = websocket_manager
+    async def process_user_task(self, content: str, model_id: str) -> str:
+        """Process a user task and return the message ID"""
+        message_id = str(uuid.uuid4())
+        while message_id in self.active_tasks.keys():
+            message_id = str(uuid.uuid4())
+        # Store the task
+        self.active_tasks[message_id] = ActiveTask(
+            message_id=message_id,
+            content=content,
+            model_id=model_id,
+            start_time=datetime.now(),
+            status="processing",
+        )
+        # Determine the agent type based on the content of the task (TODO: implement agent type detection using LLM)
+        prompt_type = "FORM_SYSTEM_PROMPT"
+        # Start the agent processing in the background
+        asyncio.create_task(
+            self._simulate_agent_processing(content, model_id, message_id, prompt_type)
+        )
+        return message_id
+    #     async def _simulate_agent_processing(self, message_id: str, content: str):
+    #         """Simulate agent processing with progress updates"""
+    #         try:
+    #             # Send agent start event
+    #             await self.websocket_manager.send_agent_start(
+    #                 content=f"Starting task: {content}", message_id=message_id
+    #             )
+    #
+    #             # Simulate processing steps
+    #             steps = [
+    #                 "Analyzing task requirements...",
+    #                 "Planning execution steps...",
+    #                 "Initializing computer interface...",
+    #                 "Executing task commands...",
+    #                 "Verifying results...",
+    #                 "Finalizing task completion...",
+    #             ]
+    #
+    #             for i, step in enumerate(steps):
+    #                 await asyncio.sleep(2)  # Simulate processing time
+    #
+    #                 # Send progress update
+    #                 await self.websocket_manager.send_agent_progress(
+    #                     content=f"{step} ({i + 1}/{len(steps)})", message_id=message_id
+    #                 )
+    #
+    #                 # Simulate VNC URL events during processing
+    #                 if i == 2:  # After "Initializing computer interface..."
+    #                     # Set VNC URL when computer interface is ready
+    #                     vnc_url = "http://localhost:6080/vnc.html?host=localhost&port=5900&autoconnect=true"
+    #                     await self.websocket_manager.send_vnc_url_set(
+    #                         vnc_url=vnc_url,
+    #                         content="Computer interface ready, VNC stream connected",
+    #                     )
+    #                 elif i == 4:  # After "Verifying results..."
+    #                     # Unset VNC URL when task is almost complete
+    #                     await self.websocket_manager.send_vnc_url_unset(
+    #                         content="Task verification complete, disconnecting VNC stream"
+    #                     )
+    #
+    #             # Calculate metadata
+    #             end_time = datetime.now()
+    #             start_time = self.active_tasks[message_id]["start_time"]
+    #             time_taken = (end_time - start_time).total_seconds()
+    #
+    #             metadata = AgentMetadata(
+    #                 tokensUsed=150 + len(content) * 2,  # Simulate token usage
+    #                 timeTaken=time_taken,
+    #                 numberOfSteps=len(steps),
+    #             )
+    #
+    #             # Send completion event
+    #             await self.websocket_manager.send_agent_complete(
+    #                 content=f"Task completed successfully: {content}",
+    #                 message_id=message_id,
+    #                 metadata=metadata,
+    #             )
+    #
+    #             # Clean up
+    #             if message_id in self.active_tasks:
+    #                 del self.active_tasks[message_id]
+    #
+    #         except Exception as e:
+    #             # Send error event
+    #             await self.websocket_manager.send_agent_error(
+    #                 content=f"Error processing task: {str(e)}", message_id=message_id
+    #             )
+    #
+    #             # Clean up
+    #             if message_id in self.active_tasks:
+    #                 del self.active_tasks[message_id]
+    def get_active_tasks(self) -> dict:
+        """Get currently active tasks"""
+        return self.active_tasks.copy()
+    def get_task_status(self, message_id: str) -> Optional[dict]:
+        """Get status of a specific task"""
+        return self.active_tasks.get(message_id)

cua2-core/src/cua2-core/services/agents/get_agents.py ADDED Viewed

	@@ -0,0 +1,57 @@

+from typing import Annotated, TypeAlias
+from pydantic import Field
+from smolagents import Model
+from backend.models.models import AgentType
+from backend.services.agents.normalized_1000_agent import Normalized1000Agent
+from backend.services.agents.normalized_agent import NormalizedAgent
+from backend.services.agents.pixel_coordonates_agent import PixelCoordinatesAgent
+from backend.services.agents.prompt import (
+    Normalized1000CoordinatesSystemPrompt,
+    NormalizedCoordinatesSystemPrompt,
+    PixelCoordinatesSystemPrompt,
+)
+from computer_use_studio import Sandbox
+# Union of the three agent implementations, used as the return annotation of get_agent.
+# NOTE(review): Field(discriminator=...) only has an effect when pydantic validates this
+# union; whether the agent classes are pydantic models is not visible here - confirm.
+Agent: TypeAlias = Annotated[
+    PixelCoordinatesAgent | Normalized1000Agent | NormalizedAgent,
+    Field(discriminator="AGENT_TYPE"),
+]
+def get_agent(
+    model: Model,
+    desktop: Sandbox,
+    agent_type: AgentType,
+    prompt_type: str,
+    data_dir: str,
+    **kwargs,
+) -> Agent:
+    """Get the agent by type"""
+    if agent_type == AgentType.PIXEL_COORDINATES:
+        return PixelCoordinatesAgent(
+            model=model,
+            desktop=desktop,
+            system_prompt=PixelCoordinatesSystemPrompt[prompt_type].value,
+            data_dir=data_dir,
+            **kwargs,
+        )
+    elif agent_type == AgentType.NORMALIZED_1000_COORDINATES:
+        return Normalized1000Agent(
+            model=model,
+            desktop=desktop,
+            system_prompt=Normalized1000CoordinatesSystemPrompt[prompt_type].value,
+            data_dir=data_dir,
+            **kwargs,
+        )
+    elif agent_type == AgentType.NORMALIZED_COORDINATES:
+        return Normalized1000Agent(
+            model=model,
+            desktop=desktop,
+            system_prompt=NormalizedCoordinatesSystemPrompt[prompt_type].value,
+            data_dir=data_dir,
+            **kwargs,
+        )
+    else:
+        raise ValueError(f"Invalid agent type: {agent_type}")

cua2-core/src/cua2-core/services/agents/normalized_1000_agent.py ADDED Viewed

	@@ -0,0 +1,293 @@

+import time
+import unicodedata
+from typing import List, Literal
+# smolagents imports
+from smolagents import Model, Tool, tool
+from smolagents.monitoring import LogLevel
+from backend.models.models import AgentType
+from backend.services.agents.prompt import Normalized1000CoordinatesSystemPrompt
+from computer_use_studio import DesktopAgentBase, Sandbox
+class Normalized1000Agent(DesktopAgentBase):
+    """Agent for desktop automation with normalized coordinates (0 to 1000)"""
+    AGENT_TYPE = AgentType.NORMALIZED_1000_COORDINATES
+    def __init__(
+        self,
+        model: Model,
+        data_dir: str,
+        desktop: Sandbox,
+        system_prompt: Normalized1000CoordinatesSystemPrompt,
+        tools: List[Tool] | None = None,
+        max_steps: int = 20,
+        verbosity_level: LogLevel = LogLevel.INFO,
+        planning_interval: int | None = None,
+        use_v1_prompt: bool = False,
+        **kwargs,
+    ):
+        """Create the agent by forwarding every argument unchanged to the base class initializer.
+        NOTE(review): what each argument means is defined by DesktopAgentBase, which is not
+        visible here.
+        """
+        super().__init__(
+            model=model,
+            data_dir=data_dir,
+            desktop=desktop,
+            system_prompt=system_prompt,
+            tools=tools,
+            max_steps=max_steps,
+            verbosity_level=verbosity_level,
+            planning_interval=planning_interval,
+            use_v1_prompt=use_v1_prompt,
+            **kwargs,
+        )
+    def _normalize_to_pixel(self, norm_x: int, norm_y: int) -> tuple[int, int]:
+        """
+        Convert normalized coordinates (0-1000) to pixel coordinates
+        Args:
+            norm_x: Normalized x coordinate (0 to 1000)
+            norm_y: Normalized y coordinate (0 to 1000)
+        Returns:
+            Tuple of (pixel_x, pixel_y)
+        """
+        # Clamp values to valid range
+        norm_x = max(0, min(1000, norm_x))
+        norm_y = max(0, min(1000, norm_y))
+        # Convert from 0-1000 range to 0-1 range, then to pixels
+        norm_x_float = norm_x / 1000.0
+        norm_y_float = norm_y / 1000.0
+        pixel_x = int(norm_x_float * self.width)
+        pixel_y = int(norm_y_float * self.height)
+        # Ensure we don't go outside screen bounds
+        pixel_x = max(0, min(self.width - 1, pixel_x))
+        pixel_y = max(0, min(self.height - 1, pixel_y))
+        return pixel_x, pixel_y
+    def _setup_desktop_tools(self):
+        """Register all desktop tools with normalized coordinate support (0-1000)"""
+        @tool
+        def click(x: int, y: int) -> str:
+            """
+            Performs a left-click at the specified normalized coordinates
+            Args:
+                x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
+                y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
+            """
+            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
+            self.desktop.left_click(pixel_x, pixel_y)
+            self.click_coordinates = (pixel_x, pixel_y)
+            self.logger.log(
+                f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
+            )
+            time.sleep(1)
+            return f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
+        @tool
+        def right_click(x: int, y: int) -> str:
+            """
+            Performs a right-click at the specified normalized coordinates
+            Args:
+                x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
+                y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
+            """
+            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
+            self.desktop.right_click(pixel_x, pixel_y)
+            self.click_coordinates = (pixel_x, pixel_y)
+            self.logger.log(
+                f"Right-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
+            )
+            return f"Right-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
+        @tool
+        def double_click(x: int, y: int) -> str:
+            """
+            Performs a double-click at the specified normalized coordinates
+            Args:
+                x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
+                y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
+            """
+            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
+            self.desktop.double_click(pixel_x, pixel_y)
+            self.click_coordinates = (pixel_x, pixel_y)
+            self.logger.log(
+                f"Double-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
+            )
+            return f"Double-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
+        @tool
+        def move_mouse(x: int, y: int) -> str:
+            """
+            Moves the mouse cursor to the specified normalized coordinates
+            Args:
+                x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
+                y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
+            """
+            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
+            self.desktop.move_mouse(pixel_x, pixel_y)
+            self.logger.log(
+                f"Moved mouse to normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
+            )
+            return f"Moved mouse to normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
+        def normalize_text(text):
+            return "".join(
+                c
+                for c in unicodedata.normalize("NFD", text)
+                if not unicodedata.combining(c)
+            )
+        @tool
+        def write(text: str) -> str:
+            """
+            Types the specified text at the current cursor position.
+            Args:
+                text: The text to type
+            """
+            # clean_text = normalize_text(text)
+            self.desktop.write(text, delay_in_ms=10)
+            self.logger.log(f"Typed text: '{text}'")
+            time.sleep(1)
+            return f"Typed text: '{text}'"
+        @tool
+        def press(key: str) -> str:
+            """
+            Presses a keyboard key or combination of keys
+            Args:
+                key: The key to press (e.g. "enter", "space", "backspace", etc.) or a multiple keys string to press, for example "ctrl+a" or "ctrl+shift+a".
+            """
+            self.desktop.press(key)
+            self.logger.log(f"Pressed key: {key}")
+            time.sleep(0.1)
+            return f"Pressed key: {key}"
+        @tool
+        def drag(x1: int, y1: int, x2: int, y2: int) -> str:
+            """
+            Clicks at normalized coordinates [x1, y1], drags mouse to [x2, y2], then release click.
+            Args:
+                x1: origin normalized x coordinate (0 to 1000)
+                y1: origin normalized y coordinate (0 to 1000)
+                x2: end normalized x coordinate (0 to 1000)
+                y2: end normalized y coordinate (0 to 1000)
+            """
+            pixel_x1, pixel_y1 = self._normalize_to_pixel(x1, y1)
+            pixel_x2, pixel_y2 = self._normalize_to_pixel(x2, y2)
+            self.desktop.drag((pixel_x1, pixel_y1), (pixel_x2, pixel_y2))
+            message = f"Dragged and dropped from normalized [{x1}, {y1}] to [{x2}, {y2}] -> pixels [{pixel_x1}, {pixel_y1}] to [{pixel_x2}, {pixel_y2}]"
+            self.logger.log(message)
+            return message
+        @tool
+        def scroll(
+            x: int,
+            y: int,
+            direction: Literal["up", "down"] = "down",
+            amount: int = 2,
+        ) -> str:
+            """
+            Moves the mouse to selected normalized coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
+            Args:
+                x: The normalized x coordinate (0 to 1000) of the element to scroll/zoom
+                y: The normalized y coordinate (0 to 1000) of the element to scroll/zoom
+                direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
+                amount: The amount to scroll. A good amount is 1 or 2.
+            """
+            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
+            self.desktop.move_mouse(pixel_x, pixel_y)
+            self.desktop.scroll(direction=direction, amount=amount)
+            message = f"Scrolled {direction} by {amount} at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
+            self.logger.log(message)
+            return message
+        @tool
+        def wait(seconds: float) -> str:
+            """
+            Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
+            Args:
+                seconds: Number of seconds to wait, generally 3 is enough.
+            """
+            time.sleep(seconds)
+            self.logger.log(f"Waited for {seconds} seconds")
+            return f"Waited for {seconds} seconds"
+        @tool
+        def open(file_or_url: str) -> str:
+            """
+            Directly opens a browser with the specified url or opens a file with the default application: use this at start of web searches rather than trying to click the browser or open a file by clicking.
+            Args:
+                file_or_url: The URL or file to open
+            """
+            self.desktop.open(file_or_url)
+            # Give it time to load
+            time.sleep(2)
+            self.logger.log(f"Opening: {file_or_url}")
+            return f"Opened: {file_or_url}"
+        @tool
+        def launch_app(app_name: str) -> str:
+            """
+            Launches the specified application.
+            Args:
+                app_name: the name of the application to launch
+            """
+            self.desktop.launch(app_name)
+            self.logger.log(f"Launched app: {app_name}")
+            return f"Launched app: {app_name}"
+        @tool
+        def execute(command: str) -> str:
+            """
+            Executes a terminal command in the desktop environment.
+            Args:
+                command: The command to execute
+            """
+            self.desktop.execute_command(command)
+            self.logger.log(f"Executed command: {command}")
+            return f"Executed command: {command}"
+        @tool
+        def refresh() -> str:
+            """
+            Refreshes the current web page if you're in a browser.
+            """
+            self.desktop.press(["ctrl", "r"])
+            self.logger.log("Refreshed the current page")
+            return "Refreshed the current page"
+        @tool
+        def go_back() -> str:
+            """
+            Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
+            Args:
+            """
+            self.desktop.press(["alt", "left"])
+            self.logger.log("Went back one page")
+            return "Went back one page"
+        # Register the tools
+        self.tools["click"] = click
+        self.tools["right_click"] = right_click
+        self.tools["double_click"] = double_click
+        self.tools["move_mouse"] = move_mouse
+        self.tools["write"] = write
+        self.tools["press"] = press
+        self.tools["scroll"] = scroll
+        self.tools["wait"] = wait
+        self.tools["open"] = open
+        self.tools["go_back"] = go_back
+        self.tools["drag"] = drag
+        self.tools["launch_app"] = launch_app
+        self.tools["execute"] = execute
+        self.tools["refresh"] = refresh
+        self.tools["refresh"] = refresh
+        self.tools["execute"] = execute
+        self.tools["refresh"] = refresh
+        self.tools["refresh"] = refresh

cua2-core/src/cua2-core/services/agents/normalized_agent.py ADDED Viewed

	@@ -0,0 +1,282 @@

+import time
+import unicodedata
+from typing import List, Literal
+# smolagents imports
+from smolagents import Model, Tool, tool
+from smolagents.monitoring import LogLevel
+from backend.models.models import AgentType
+from backend.services.agents.prompt import NormalizedCoordinatesSystemPrompt
+from computer_use_studio import DesktopAgentBase, Sandbox
+class NormalizedAgent(DesktopAgentBase):
+    """Agent for desktop automation with normalized coordinates (0.0 to 1.0)"""
+    AGENT_TYPE = AgentType.NORMALIZED_COORDINATES
+    def __init__(
+        self,
+        model: Model,
+        data_dir: str,
+        desktop: Sandbox,
+        system_prompt: NormalizedCoordinatesSystemPrompt,
+        tools: List[Tool] | None = None,
+        max_steps: int = 20,
+        verbosity_level: LogLevel = LogLevel.INFO,
+        planning_interval: int | None = None,
+        use_v1_prompt: bool = False,
+        **kwargs,
+    ):
+        super().__init__(
+            model=model,
+            data_dir=data_dir,
+            desktop=desktop,
+            system_prompt=system_prompt,
+            tools=tools,
+            max_steps=max_steps,
+            verbosity_level=verbosity_level,
+            planning_interval=planning_interval,
+            use_v1_prompt=use_v1_prompt,
+            **kwargs,
+        )
+    def _normalize_to_pixel(self, norm_x: float, norm_y: float) -> tuple[int, int]:
+        """
+        Convert normalized coordinates (0.0-1.0) to pixel coordinates
+        Args:
+            norm_x: Normalized x coordinate (0.0 to 1.0)
+            norm_y: Normalized y coordinate (0.0 to 1.0)
+        Returns:
+            Tuple of (pixel_x, pixel_y)
+        """
+        # Clamp values to valid range
+        norm_x = max(0.0, min(1.0, norm_x))
+        norm_y = max(0.0, min(1.0, norm_y))
+        pixel_x = int(norm_x * self.width)
+        pixel_y = int(norm_y * self.height)
+        # Ensure we don't go outside screen bounds
+        pixel_x = max(0, min(self.width - 1, pixel_x))
+        pixel_y = max(0, min(self.height - 1, pixel_y))
+        return pixel_x, pixel_y
+    def _setup_desktop_tools(self):
+        """Register all desktop tools with normalized coordinate support"""
+        @tool
+        def click(x: float, y: float) -> str:
+            """
+            Performs a left-click at the specified normalized coordinates
+            Args:
+                x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
+                y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
+            """
+            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
+            self.desktop.left_click(pixel_x, pixel_y)
+            self.click_coordinates = (pixel_x, pixel_y)
+            self.logger.log(
+                f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
+            )
+            return f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
+        @tool
+        def right_click(x: float, y: float) -> str:
+            """
+            Performs a right-click at the specified normalized coordinates
+            Args:
+                x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
+                y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
+            """
+            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
+            self.desktop.right_click(pixel_x, pixel_y)
+            self.click_coordinates = (pixel_x, pixel_y)
+            self.logger.log(
+                f"Right-clicked at normalized coordinates ({pixel_x}, {pixel_y})"
+            )
+            return f"Right-clicked at normalized coordinates ({pixel_x}, {pixel_y})"
+        @tool
+        def double_click(x: float, y: float) -> str:
+            """
+            Performs a double-click at the specified normalized coordinates
+            Args:
+                x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
+                y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
+            """
+            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
+            self.desktop.double_click(pixel_x, pixel_y)
+            self.click_coordinates = (pixel_x, pixel_y)
+            self.logger.log(
+                f"Double-clicked at normalized coordinates ({pixel_x}, {pixel_y})"
+            )
+            return f"Double-clicked at normalized coordinates ({pixel_x}, {pixel_y})"
+        @tool
+        def move_mouse(x: float, y: float) -> str:
+            """
+            Moves the mouse cursor to the specified normalized coordinates
+            Args:
+                x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
+                y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
+            """
+            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
+            self.desktop.move_mouse(pixel_x, pixel_y)
+            self.logger.log(
+                f"Moved mouse to normalized coordinates ({pixel_x}, {pixel_y})"
+            )
+            return f"Moved mouse to normalized coordinates ({pixel_x}, {pixel_y})"
+        def normalize_text(text):
+            return "".join(
+                c
+                for c in unicodedata.normalize("NFD", text)
+                if not unicodedata.combining(c)
+            )
+        @tool
+        def write(text: str) -> str:
+            """
+            Types the specified text at the current cursor position.
+            Args:
+                text: The text to type
+            """
+            # clean_text = normalize_text(text)
+            self.desktop.write(text, delay_in_ms=10)
+            self.logger.log(f"Typed text: '{text}'")
+            return f"Typed text: '{text}'"
+        @tool
+        def press(key: str) -> str:
+            """
+            Presses a keyboard key or combination of keys
+            Args:
+                key: The key to press (e.g. "enter", "space", "backspace", etc.) or a multiple keys string to press, for example "ctrl+a" or "ctrl+shift+a".
+            """
+            self.desktop.press(key)
+            self.logger.log(f"Pressed key: {key}")
+            return f"Pressed key: {key}"
+        @tool
+        def drag(x1: float, y1: float, x2: float, y2: float) -> str:
+            """
+            Clicks at normalized coordinates [x1, y1], drags mouse to [x2, y2], then release click.
+            Args:
+                x1: origin normalized x coordinate (0.0 to 1.0)
+                y1: origin normalized y coordinate (0.0 to 1.0)
+                x2: end normalized x coordinate (0.0 to 1.0)
+                y2: end normalized y coordinate (0.0 to 1.0)
+            """
+            pixel_x1, pixel_y1 = self._normalize_to_pixel(x1, y1)
+            pixel_x2, pixel_y2 = self._normalize_to_pixel(x2, y2)
+            self.desktop.drag((pixel_x1, pixel_y1), (pixel_x2, pixel_y2))
+            message = f"Dragged and dropped from normalized [{pixel_x1}, {pixel_y1}] to [{pixel_x2}, {pixel_y2}]"
+            self.logger.log(message)
+            return message
+        @tool
+        def scroll(
+            x: float,
+            y: float,
+            direction: Literal["up", "down"] = "down",
+            amount: int = 2,
+        ) -> str:
+            """
+            Moves the mouse to selected normalized coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
+            Args:
+                x: The normalized x coordinate (0.0 to 1.0) of the element to scroll/zoom
+                y: The normalized y coordinate (0.0 to 1.0) of the element to scroll/zoom
+                direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
+                amount: The amount to scroll. A good amount is 1 or 2.
+            """
+            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
+            self.desktop.move_mouse(pixel_x, pixel_y)
+            self.desktop.scroll(direction=direction, amount=amount)
+            message = f"Scrolled {direction} by {amount} at normalized coordinates ({pixel_x}, {pixel_y})"
+            self.logger.log(message)
+            return message
+        @tool
+        def wait(seconds: float) -> str:
+            """
+            Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
+            Args:
+                seconds: Number of seconds to wait, generally 3 is enough.
+            """
+            time.sleep(seconds)
+            self.logger.log(f"Waited for {seconds} seconds")
+            return f"Waited for {seconds} seconds"
+        @tool
+        def open(file_or_url: str) -> str:
+            """
+            Directly opens a browser with the specified url or opens a file with the default application: use this at start of web searches rather than trying to click the browser or open a file by clicking.
+            Args:
+                file_or_url: The URL or file to open
+            """
+            self.desktop.open(file_or_url)
+            # Give it time to load
+            time.sleep(2)
+            self.logger.log(f"Opening: {file_or_url}")
+            return f"Opened: {file_or_url}"
+        @tool
+        def launch_app(app_name: str) -> str:
+            """
+            Launches the specified application.
+            Args:
+                app_name: the name of the application to launch
+            """
+            self.desktop.launch(app_name)
+            self.logger.log(f"Launched app: {app_name}")
+            return f"Launched app: {app_name}"
+        @tool
+        def execute(command: str) -> str:
+            """
+            Executes a terminal command in the desktop environment.
+            Args:
+                command: The command to execute
+            """
+            self.desktop.execute_command(command)
+            self.logger.log(f"Executed command: {command}")
+            return f"Executed command: {command}"
+        @tool
+        def refresh() -> str:
+            """
+            Refreshes the current web page if you're in a browser.
+            """
+            self.desktop.press(["ctrl", "r"])
+            self.logger.log("Refreshed the current page")
+            return "Refreshed the current page"
+        @tool
+        def go_back() -> str:
+            """
+            Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
+            Args:
+            """
+            self.desktop.press(["alt", "left"])
+            self.logger.log("Went back one page")
+            return "Went back one page"
+        # Register the tools
+        self.tools["click"] = click
+        self.tools["right_click"] = right_click
+        self.tools["double_click"] = double_click
+        self.tools["move_mouse"] = move_mouse
+        self.tools["write"] = write
+        self.tools["press"] = press
+        self.tools["scroll"] = scroll
+        self.tools["wait"] = wait
+        self.tools["open"] = open
+        self.tools["go_back"] = go_back
+        self.tools["drag"] = drag
+        self.tools["launch_app"] = launch_app
+        self.tools["execute"] = execute
+        self.tools["refresh"] = refresh

cua2-core/src/cua2-core/services/agents/pixel_coordonates_agent.py ADDED Viewed

	@@ -0,0 +1,317 @@

+import time
+import unicodedata
+from typing import List, Literal
+# smolagents imports
+from smolagents import Model, Tool, tool
+from smolagents.monitoring import LogLevel
+from backend.models.models import AgentType
+from backend.services.agents.prompt import PixelCoordinatesSystemPrompt
+from computer_use_studio import DesktopAgentBase, Sandbox
+class PixelCoordinatesAgent(DesktopAgentBase):
+    """Agent for desktop automation"""
+    AGENT_TYPE = AgentType.PIXEL_COORDINATES
+    def __init__(
+        self,
+        model: Model,
+        data_dir: str,
+        desktop: Sandbox,
+        system_prompt: PixelCoordinatesSystemPrompt,
+        tools: List[Tool] | None = None,
+        max_steps: int = 20,
+        verbosity_level: LogLevel = LogLevel.INFO,
+        planning_interval: int | None = None,
+        use_v1_prompt: bool = False,
+        **kwargs,
+    ):
+        super().__init__(
+            model=model,
+            data_dir=data_dir,
+            desktop=desktop,
+            system_prompt=system_prompt,
+            tools=tools,
+            max_steps=max_steps,
+            verbosity_level=verbosity_level,
+            planning_interval=planning_interval,
+            use_v1_prompt=use_v1_prompt,
+            **kwargs,
+        )
+        # OPTIONAL: Add a custom prompt template - see src/computer_use_studio/desktop_agent/desktop_agent_base.py for more details about the default prompt template
+        # self.prompt_templates["system_prompt"] = CUSTOM_PROMPT_TEMPLATE.replace(
+        #     "<<resolution_x>>", str(self.width)
+        # ).replace("<<resolution_y>>", str(self.height))
+        # Important: Change the prompt to get better results, depending on your action space.
+    def _setup_desktop_tools(self):
+        """Register all desktop tools"""
+        @tool
+        def click(x: int, y: int) -> str:
+            """
+            Performs a left-click at the specified coordinates
+            Args:
+                x: The x coordinate (horizontal position)
+                y: The y coordinate (vertical position)
+            """
+            self.desktop.left_click(x, y)
+            self.click_coordinates = (x, y)
+            self.logger.log(f"Clicked at coordinates ({x}, {y})")
+            return f"Clicked at coordinates ({x}, {y})"
+        @tool
+        def right_click(x: int, y: int) -> str:
+            """
+            Performs a right-click at the specified coordinates
+            Args:
+                x: The x coordinate (horizontal position)
+                y: The y coordinate (vertical position)
+            """
+            self.desktop.right_click(x, y)
+            self.click_coordinates = (x, y)
+            self.logger.log(f"Right-clicked at coordinates ({x}, {y})")
+            return f"Right-clicked at coordinates ({x}, {y})"
+        @tool
+        def double_click(x: int, y: int) -> str:
+            """
+            Performs a double-click at the specified coordinates
+            Args:
+                x: The x coordinate (horizontal position)
+                y: The y coordinate (vertical position)
+            """
+            self.desktop.double_click(x, y)
+            self.click_coordinates = (x, y)
+            self.logger.log(f"Double-clicked at coordinates ({x}, {y})")
+            return f"Double-clicked at coordinates ({x}, {y})"
+        @tool
+        def move_mouse(x: int, y: int) -> str:
+            """
+            Moves the mouse cursor to the specified coordinates
+            Args:
+                x: The x coordinate (horizontal position)
+                y: The y coordinate (vertical position)
+            """
+            self.desktop.move_mouse(x, y)
+            self.logger.log(f"Moved mouse to coordinates ({x}, {y})")
+            return f"Moved mouse to coordinates ({x}, {y})"
+        def normalize_text(text):
+            return "".join(
+                c
+                for c in unicodedata.normalize("NFD", text)
+                if not unicodedata.combining(c)
+            )
+        @tool
+        def write(text: str) -> str:
+            """
+            Types the specified text at the current cursor position.
+            Args:
+                text: The text to type
+            """
+            # clean_text = normalize_text(text)
+            self.desktop.write(text, delay_in_ms=10)
+            self.logger.log(f"Typed text: '{text}'")
+            return f"Typed text: '{text}'"
+        @tool
+        def press(key: str) -> str:
+            """
+            Presses a keyboard key or combination of keys
+            Args:
+                key: The key to press (e.g. "enter", "space", "backspace", etc.) or a multiple keys string to press, for example "ctrl+a" or "ctrl+shift+a".
+            """
+            self.desktop.press(key)
+            self.logger.log(f"Pressed key: {key}")
+            return f"Pressed key: {key}"
+        @tool
+        def drag(x1: int, y1: int, x2: int, y2: int) -> str:
+            """
+            Clicks [x1, y1], drags mouse to [x2, y2], then release click.
+            Args:
+                x1: origin x coordinate
+                y1: origin y coordinate
+                x2: end x coordinate
+                y2: end y coordinate
+            """
+            self.desktop.drag((x1, y1), (x2, y2))
+            message = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
+            self.logger.log(message)
+            return message
+        @tool
+        def scroll(
+            x: int, y: int, direction: Literal["up", "down"] = "down", amount: int = 2
+        ) -> str:
+            """
+            Moves the mouse to selected coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
+            Args:
+                x: The x coordinate (horizontal position) of the element to scroll/zoom
+                y: The y coordinate (vertical position) of the element to scroll/zoom
+                direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
+                amount: The amount to scroll. A good amount is 1 or 2.
+            """
+            self.desktop.move_mouse(x, y)
+            self.desktop.scroll(direction=direction, amount=amount)
+            message = f"Scrolled {direction} by {amount}"
+            self.logger.log(message)
+            return message
+        @tool
+        def wait(seconds: float) -> str:
+            """
+            Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
+            Args:
+                seconds: Number of seconds to wait, generally 3 is enough.
+            """
+            time.sleep(seconds)
+            self.logger.log(f"Waited for {seconds} seconds")
+            return f"Waited for {seconds} seconds"
+        @tool
+        def open(file_or_url: str) -> str:
+            """
+            Directly opens a browser with the specified url or opens a file with the default application: use this at start of web searches rather than trying to click the browser or open a file by clicking.
+            Args:
+                file_or_url: The URL or file to open
+            """
+            self.desktop.open(file_or_url)
+            # Give it time to load
+            time.sleep(2)
+            self.logger.log(f"Opening: {file_or_url}")
+            return f"Opened: {file_or_url}"
+        @tool
+        def launch_app(app_name: str) -> str:
+            """
+            Launches the specified application.
+            Args:
+                app_name: the name of the application to launch
+            """
+            self.desktop.launch(app_name)
+            self.logger.log(f"Launched app: {app_name}")
+            return f"Launched app: {app_name}"
+        @tool
+        def execute(command: str) -> str:
+            """
+            Executes a terminal command in the desktop environment.
+            Args:
+                command: The command to execute
+            """
+            self.desktop.execute_command(command)
+            self.logger.log(f"Executed command: {command}")
+            return f"Executed command: {command}"
+        @tool
+        def refresh() -> str:
+            """
+            Refreshes the current web page if you're in a browser.
+            """
+            self.desktop.press(["ctrl", "r"])
+            self.logger.log("Refreshed the current page")
+            return "Refreshed the current page"
+        @tool
+        def go_back() -> str:
+            """
+            Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
+            Args:
+            """
+            self.desktop.press(["alt", "left"])
+            self.logger.log("Went back one page")
+            return "Went back one page"
+        # Register the tools
+        self.tools["click"] = click
+        self.tools["right_click"] = right_click
+        self.tools["double_click"] = double_click
+        self.tools["move_mouse"] = move_mouse
+        self.tools["write"] = write
+        self.tools["press"] = press
+        self.tools["scroll"] = scroll
+        self.tools["wait"] = wait
+        self.tools["open"] = open
+        self.tools["go_back"] = go_back
+        self.tools["drag"] = drag
+        self.tools["launch_app"] = launch_app
+        self.tools["execute"] = execute
+        self.tools["refresh"] = refresh
+if __name__ == "__main__":
+    # ================================
+    # MODEL CONFIGURATION
+    # ================================
+    # import os
+    # from smolagents import OpenAIServerModel
+    # model = OpenAIServerModel(
+    #     model_id="gpt-4.1",
+    #     api_key=os.getenv("OPENAI_API_KEY"),
+    # )
+    # For Inference Endpoints
+    # from smolagents import HfApiModel
+    # model = HfApiModel(
+    #     model_id="Qwen/Qwen2.5-VL-72B-Instruct",
+    #     token=os.getenv("HF_TOKEN"),
+    #     provider="nebius",
+    # )
+    # For Transformer models
+    # from smolagents import TransformersModel
+    # model = TransformersModel(
+    #     model_id="Qwen/Qwen2.5-VL-72B-Instruct",
+    #     device_map="auto",
+    #     torch_dtype="auto",
+    #     trust_remote_code=True,
+    # )
+    # For other providers
+    from smolagents import LiteLLMModel
+    model = LiteLLMModel(model_id="anthropic/claude-sonnet-4-5-20250929")
+    # model = LiteLLMModel(model_id="gemini/gemini-2.5-flash")
+    # ================================
+    # RUN AGENT
+    # ================================
+    # Interactive task input loop
+    sandbox = None
+    agent = None
+    while True:
+        try:
+            task = get_user_input()
+            if task is None:
+                exit()
+            sandbox = Sandbox(headless=False, resolution=(1024, 1024))
+            sandbox.start_recording()
+            agent = FormAgent(model=model, data_dir="data", desktop=sandbox)
+            print("\n🤖 Agent is working on your task...")
+            print("-" * 60)
+            result = agent.run(task)
+            print("\n✅ Task completed successfully!")
+            print(f"📄 Result: {result}")
+        except Exception as e:
+            print(f"\n❌ Error occurred: {str(e)}")
+        finally:
+            if sandbox:
+                sandbox.end_recording("recording.mp4")
+            if agent:
+                agent.close()
+        print("\n" + "=" * 60)

cua2-core/src/cua2-core/services/agents/prompt.py ADDED Viewed

	@@ -0,0 +1,548 @@

+from enum import Enum
+class PixelCoordinatesSystemPrompt(Enum):
+    """Pixel coordinates system prompt"""
+    FORM_SYSTEM_PROMPT = """You are a web form automation assistant that can control a remote desktop environment with a web browser open.
+The current date is <<current_date>>.
+<action_process>
+You will be given a task to complete in several steps (e.g. filling forms, signing up, logging in, submitting claims).
+At each step you will perform **one action**.
+After each action, you will receive an updated screenshot.
+Then you will proceed as follows, with these sections — do not skip any:
+Short term goal: ...
+What I see: ...
+Reflection: ...
+Action:
+```python
+tool_name(arguments)
+```<end_code>
+Always format your Action section as **Python code blocks** exactly as shown above.
+</action_process>
+<tools>
+On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
+{%- for tool in tools.values() %}
+- {{ tool.name }}: {{ tool.description }}
+    Takes inputs: {{tool.inputs}}
+    Returns an output of type: {{tool.output_type}}
+{%- endfor %}
+</tools>
+<web_form_guidelines>
+Look at the elements on the screen (input fields, checkboxes, buttons, dropdowns) to decide where to interact.
+The desktop has a resolution of <<resolution_x>>x<<resolution_y>> pixels — use that to decide mouse coordinates.
+**Never use hypothetical or assumed coordinates; always use real coordinates visible on the screenshot.**
+### Typical Web Form Interactions
+- **Input fields**: click in the field first to focus it, then use `write("text")`.
+- **Passwords**: type them just like text — `write("password123")`.
+- **Checkboxes / radio buttons**: use `click(x,y)` directly on the box/circle.
+- **Dropdown menus**: click to open, then click the option or use arrow keys + press("enter").
+- **Submit buttons**: identify clearly labelled “Sign up”, “Sign in”, “Submit” buttons and click at their coordinates.
+- **Captcha or 2FA**: wait or prompt for external handling (do not bypass security).
+- **Pop-ups**: try `press("enter")` to confirm or `press("escape")` to close if they block your action.
+### Grouping Multiple Inputs
+- If multiple fields, checkboxes, or similar controls are clearly visible **and** can be filled/clicked in sequence without ambiguity, you may include **several actions in one code block**.
+- Keep each Action still in a **Python code block** but with multiple tool calls inside, for example:
+```python
+click(450, 320)        # Email field
+wait(0.1)
+write("user@example.com")
+click(450, 380)        # Password field
+wait(0.1)
+write("mypassword123")
+click(430, 600)        # Checkbox “Accept terms”
+wait(0.1)
+```<end_code>
+- Only group actions when:
+  1. They’re all part of the **same form or step**,
+  2. The screenshot clearly shows all elements and coordinates,
+  3. The order of operations is obvious.
+- Otherwise, default back to one Action per step.
+### Precision
+- Always **click before typing** to ensure the right field is active.
+- Always **scroll if needed** to bring elements into view before clicking.
+- Always **validate each action** via the screenshot before continuing.
+</web_form_guidelines>
+<task_resolution_example>
+For a task like “Sign up for an account and submit the form”:
+Step 1:
+Short term goal: I want to open the signup page.
+What I see: The browser is open on the homepage.
+Reflection: I will open the signup URL directly.
+Action:
+```python
+open("https://example.com/signup")
+wait(3)
+```<end_code>
+Step 2:
+Short term goal: I want to fill the “Email” field.
+What I see: I see the signup form with an “Email” field at (450, 320).
+Reflection: I will click inside the field then type my email.
+Action:
+```python
+click(450, 320)
+write("user@example.com")
+```<end_code>
+Step 3:
+Short term goal: I want to check the “I accept terms” checkbox.
+What I see: The checkbox is at (430, 600).
+Reflection: I will click it.
+Action:
+```python
+click(430, 600)
+```<end_code>
+Step 4:
+Short term goal: I want to submit the form.
+What I see: The “Sign Up” button at (500, 700).
+Reflection: I will click the button to submit.
+Action:
+```python
+click(500, 700)
+wait(3)
+```<end_code>
+Step 5:
+Short term goal: Verify signup completed.
+What I see: A confirmation page “Welcome user@example.com”.
+Reflection: Task complete.
+Action:
+```python
+final_answer("Signup completed")
+```<end_code>
+</task_resolution_example>
+<general_guidelines>
+# GUI Agent Guidelines for Web Forms
+## Environment Overview
+Ubuntu 22.04 XFCE4 desktop with Google Chrome/Chromium browser.
+Agent can fill forms, sign up, sign in, click checkboxes, submit claims.
+## Core Principles
+### 1. Screenshot Analysis
+- Always analyze the latest screenshot carefully before each action.
+- Validate that previous actions worked by examining the current state.
+- If an action didn’t work, try an alternative rather than repeating blindly.
+### 2. Action Execution
+- Execute one action or multiple actions at a time (grouped in one code block).
+- Wait for appropriate loading times using `wait()` but not indefinitely.
+- Scroll to bring hidden elements into view.
+### 3. Keyboard Shortcuts
+- Use `tab` to move between fields, `space` to toggle checkboxes, `enter` to submit forms.
+- Copy/paste: `ctrl+C`, `ctrl+V`.
+- Refresh page: `refresh()`.
+### 4. Error Recovery
+- If clicking doesn’t work, try double_click or right_click.
+- If typing doesn’t appear, ensure the field is focused with click.
+- If popups block the screen, try `press("enter")` or `press("escape")`.
+### 5. Security & Privacy
+- Don’t attempt to bypass captchas or 2FA automatically.
+- Don’t store credentials in plain text unless instructed.
+### 6. Final Answer
+- When the form is successfully submitted or the goal achieved, use:
+```python
+final_answer("Done")
+```<end_code>
+</general_guidelines>
+"""
+class Normalized1000CoordinatesSystemPrompt(Enum):
+    """Normalized 1000 coordinates system prompt"""
+    FORM_SYSTEM_PROMPT = """You are a web form automation assistant that can control a remote desktop environment with a web browser open.
+The current date is <<current_date>>.
+<action_process>
+You will be given a task to complete in several steps (e.g. filling forms, signing up, logging in, submitting claims).
+At each step you will perform **one action**.
+After each action, you will receive an updated screenshot.
+Then you will proceed as follows, with these sections — do not skip any:
+Short term goal: ...
+What I see: ...
+Reflection: ...
+Action:
+```python
+tool_name(arguments)
+```<end_code>
+Always format your Action section as **Python code blocks** exactly as shown above.
+</action_process>
+<tools>
+On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
+{%- for tool in tools.values() %}
+- {{ tool.name }}: {{ tool.description }}
+    Takes inputs: {{tool.inputs}}
+    Returns an output of type: {{tool.output_type}}
+{%- endfor %}
+</tools>
+<coordinate_system>
+**IMPORTANT: This system uses NORMALIZED COORDINATES (0 to 1000)**
+You must use normalized coordinates:
+- **x-coordinate**: 0 = left edge, 1000 = right edge of screen
+- **y-coordinate**: 0 = top edge, 1000 = bottom edge of screen
+- **Example**: Center of screen is (500, 500)
+- **Example**: Top-left corner is (0, 0)
+- **Example**: Bottom-right corner is (1000, 1000)
+When you see an element on the screenshot:
+1. Estimate its position relative to the screen dimensions
+2. Convert to normalized coordinates between 0 and 1000
+3. Use these normalized coordinates in your tool calls
+**Never use pixel coordinates directly - always use normalized coordinates between 0 and 1000**
+</coordinate_system>
+<web_form_guidelines>
+Look at the elements on the screen (input fields, checkboxes, buttons, dropdowns) to decide where to interact.
+**Always use normalized coordinates (0 to 1000) based on the element's relative position on the screen.**
+### Typical Web Form Interactions
+- **Input fields**: click in the field first to focus it, then use `write("text")`.
+- **Passwords**: type them just like text — `write("password123")`.
+- **Checkboxes / radio buttons**: use `click(x,y)` directly on the box/circle. Click on the box/circle itself at the left side of the text, not on the text label.
+- **Dropdown menus**: click to open, then click the option or use arrow keys + press("enter").
+- **Submit buttons**: identify clearly labelled "Sign up", "Sign in", "Submit" buttons and click at their normalized coordinates.
+- **Captcha or 2FA**: wait or prompt for external handling (do not bypass security).
+- **Pop-ups**: try `press("enter")` to confirm or `press("escape")` to close if they block your action.
+### Grouping Multiple Inputs
+- If multiple fields, checkboxes, or similar controls are clearly visible **and** can be filled/clicked in sequence without ambiguity, you may include **several actions in one code block**.
+- Keep each Action still in a **Python code block** but with multiple tool calls inside, for example:
+```python
+click(470, 300)        # Email field (normalized coordinates)
+write("user@example.com")
+click(470, 350)        # Password field (normalized coordinates)
+write("mypassword123")
+click(450, 550)        # Checkbox left side of the text "Accept terms" (normalized coordinates)
+```<end_code>
+- Only group actions when:
+  1. They're all part of the **same form or step**,
+  2. The screenshot clearly shows all elements and coordinates,
+  3. The order of operations is obvious.
+- Otherwise, default back to one Action per step.
+### Precision
+- Always **click before typing** to ensure the right field is active.
+- Always **scroll if needed** to bring elements into view before clicking.
+- Always **validate each action** via the screenshot before continuing.
+- Always use **normalized coordinates between 0 and 1000**.
+</web_form_guidelines>
+<task_resolution_example>
+For a task like "Sign up for an account and submit the form":
+Step 1:
+Short term goal: I want to open the signup page.
+What I see: The browser is open on the homepage.
+Reflection: I will open the signup URL directly.
+Action:
+```python
+open("https://example.com/signup")
+wait(3)
+```<end_code>
+Step 2:
+Short term goal: I want to fill the form fields that are currently visible.
+What I see: I see the signup form with "Email" and "Password" fields, plus a checkbox for accepting terms.
+Reflection: I will fill all the visible form fields in sequence - click the email field and type the email, then click the password field and type the password, then click the checkbox to accept terms.
+Action:
+```python
+click(470, 300)        # Email field (normalized coordinates)
+write("user@example.com")
+click(470, 350)        # Password field (normalized coordinates)
+write("mypassword123")
+click(450, 550)        # Checkbox left side of the text "Accept terms" (normalized coordinates)
+```<end_code>
+Step 3:
+Short term goal: I need to scroll down to see the "Sign Up" button.
+What I see: The form fields are filled, but I cannot see the "Sign Up" button - it's likely below the current view.
+Reflection: I will scroll down to bring the submit button into view so I can click it in the next step.
+Action:
+```python
+scroll(500, 500, "down", 3)
+```<end_code>
+Step 4:
+Short term goal: I want to submit the form.
+What I see: The "Sign Up" button is at the bottom center, around 520, 650 in normalized coordinates.
+Reflection: I will click the button to submit.
+Action:
+```python
+click(520, 650)
+wait(3)
+```<end_code>
+Step 5:
+Short term goal: Verify signup completed.
+What I see: A confirmation page "Welcome user@example.com".
+Reflection: Task complete.
+Action:
+```python
+final_answer("Signup completed")
+```<end_code>
+</task_resolution_example>
+<general_guidelines>
+# GUI Agent Guidelines for Web Forms (0-1000 Coordinates)
+## Environment Overview
+Ubuntu 22.04 XFCE4 desktop with Google Chrome/Chromium browser.
+Agent can fill forms, sign up, sign in, click checkboxes, submit claims.
+**All coordinates are normalized between 0 and 1000.**
+## Core Principles
+### 1. Screenshot Analysis
+- Always analyze the latest screenshot carefully before each action.
+- Validate that previous actions worked by examining the current state.
+- If an action didn't work, try an alternative rather than repeating blindly.
+### 2. Action Execution
+- Execute one or multiple actions at a time (grouped in one code block).
+- Wait for appropriate loading times using `wait()` but not indefinitely.
+- Scroll to bring hidden elements into view.
+### 3. Coordinate System
+- **CRITICAL**: Always use normalized coordinates (0 to 1000)
+- Convert visual position on screen to normalized coordinates
+- Center of screen = (500, 500)
+- Top-left = (0, 0), Bottom-right = (1000, 1000)
+### 4. Keyboard Shortcuts
+- Use `tab` to move between fields, `space` to toggle checkboxes, `enter` to submit forms.
+- Copy/paste: `ctrl+C`, `ctrl+V`.
+- Refresh page: `refresh()`.
+### 5. Error Recovery
+- If clicking doesn't work, try double_click or right_click.
+- If typing doesn't appear, ensure the field is focused with click.
+- If popups block the screen, try `press("enter")` or `press("escape")`.
+### 6. Security & Privacy
+- Don't attempt to bypass captchas or 2FA automatically.
+- Don't store credentials in plain text unless instructed.
+### 7. Final Answer
+- When the form is successfully submitted or the goal achieved, use:
+```python
+final_answer("Done")
+```<end_code>
+</general_guidelines>
+"""
+class NormalizedCoordinatesSystemPrompt(Enum):
+    """Normalized coordinates system prompt"""
+    FORM_SYSTEM_PROMPT = """You are a web form automation assistant that can control a remote desktop environment with a web browser open.
+The current date is <<current_date>>.
+<action_process>
+You will be given a task to complete in several steps (e.g. filling forms, signing up, logging in, submitting claims).
+At each step you will perform **one action**.
+After each action, you will receive an updated screenshot.
+Then you will proceed as follows, with these sections — do not skip any:
+Short term goal: ...
+What I see: ...
+Reflection: ...
+Action:
+```python
+tool_name(arguments)
+```<end_code>
+Always format your Action section as **Python code blocks** exactly as shown above.
+</action_process>
+<tools>
+On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
+{%- for tool in tools.values() %}
+- {{ tool.name }}: {{ tool.description }}
+    Takes inputs: {{tool.inputs}}
+    Returns an output of type: {{tool.output_type}}
+{%- endfor %}
+</tools>
+<coordinate_system>
+**IMPORTANT: This system uses NORMALIZED COORDINATES (0.0 to 1.0)**
+You must use normalized coordinates:
+- **x-coordinate**: 0.0 = left edge, 1.0 = right edge of screen
+- **y-coordinate**: 0.0 = top edge, 1.0 = bottom edge of screen
+- **Example**: Center of screen is (0.5, 0.5)
+- **Example**: Top-left corner is (0.0, 0.0)
+- **Example**: Bottom-right corner is (1.0, 1.0)
+When you see an element on the screenshot:
+1. Estimate its position relative to the screen dimensions
+2. Convert to normalized coordinates between 0.0 and 1.0
+3. Use these normalized coordinates in your tool calls
+**Never use pixel coordinates directly - always use normalized coordinates between 0.0 and 1.0**
+</coordinate_system>
+<web_form_guidelines>
+Look at the elements on the screen (input fields, checkboxes, buttons, dropdowns) to decide where to interact.
+**Always use normalized coordinates (0.0 to 1.0) based on the element's relative position on the screen.**
+### Typical Web Form Interactions
+- **Input fields**: click in the field first to focus it, then use `write("text")`.
+- **Passwords**: type them just like text — `write("password123")`.
+- **Checkboxes / radio buttons**: use `click(x,y)` directly on the box/circle.
+- **Dropdown menus**: click to open, then click the option or use arrow keys + press("enter").
+- **Submit buttons**: identify clearly labelled "Sign up", "Sign in", "Submit" buttons and click at their normalized coordinates.
+- **Captcha or 2FA**: wait or prompt for external handling (do not bypass security).
+- **Pop-ups**: try `press("enter")` to confirm or `press("escape")` to close if they block your action.
+### Grouping Multiple Inputs
+- If multiple fields, checkboxes, or similar controls are clearly visible **and** can be filled/clicked in sequence without ambiguity, you may include **several actions in one code block**.
+- Keep each Action still in a **Python code block** but with multiple tool calls inside, for example:
+```python
+click(0.47, 0.30)        # Email field (normalized coordinates)
+wait(0.1)
+write("user@example.com")
+click(0.47, 0.35)        # Password field (normalized coordinates)
+wait(0.1)
+write("mypassword123")
+click(0.45, 0.55)        # Checkbox "Accept terms" (normalized coordinates)
+wait(0.1)
+```<end_code>
+- Only group actions when:
+  1. They're all part of the **same form or step**,
+  2. The screenshot clearly shows all elements and coordinates,
+  3. The order of operations is obvious.
+- Otherwise, default back to one Action per step.
+### Precision
+- Always **click before typing** to ensure the right field is active.
+- Always **scroll if needed** to bring elements into view before clicking.
+- Always **validate each action** via the screenshot before continuing.
+- Always use **normalized coordinates between 0.0 and 1.0**.
+</web_form_guidelines>
+<task_resolution_example>
+For a task like "Sign up for an account and submit the form":
+Step 1:
+Short term goal: I want to open the signup page.
+What I see: The browser is open on the homepage.
+Reflection: I will open the signup URL directly.
+Action:
+```python
+open("https://example.com/signup")
+wait(3)
+```<end_code>
+Step 2:
+Short term goal: I want to fill the "Email" field.
+What I see: I see the signup form with an "Email" field roughly in the center-left of the screen.
+Reflection: I will click inside the field (approximately 0.47, 0.30 in normalized coordinates) then type my email.
+Action:
+```python
+click(0.47, 0.30)
+write("user@example.com")
+```<end_code>
+Step 3:
+Short term goal: I want to check the "I accept terms" checkbox.
+What I see: The checkbox is in the lower portion of the form, around 0.45, 0.55 in normalized coordinates.
+Reflection: I will click it.
+Action:
+```python
+click(0.45, 0.55)
+```<end_code>
+Step 4:
+Short term goal: I want to submit the form.
+What I see: The "Sign Up" button is at the bottom center, around 0.52, 0.65 in normalized coordinates.
+Reflection: I will click the button to submit.
+Action:
+```python
+click(0.52, 0.65)
+wait(3)
+```<end_code>
+Step 5:
+Short term goal: Verify signup completed.
+What I see: A confirmation page "Welcome user@example.com".
+Reflection: Task complete.
+Action:
+```python
+final_answer("Signup completed")
+```<end_code>
+</task_resolution_example>
+<general_guidelines>
+# GUI Agent Guidelines for Web Forms (Normalized Coordinates)
+## Environment Overview
+Ubuntu 22.04 XFCE4 desktop with Google Chrome/Chromium browser.
+Agent can fill forms, sign up, sign in, click checkboxes, submit claims.
+**All coordinates are normalized between 0.0 and 1.0.**
+## Core Principles
+### 1. Screenshot Analysis
+- Always analyze the latest screenshot carefully before each action.
+- Validate that previous actions worked by examining the current state.
+- If an action didn't work, try an alternative rather than repeating blindly.
+### 2. Action Execution
+- Execute one action at a time.
+- Wait for appropriate loading times using `wait()` but not indefinitely.
+- Scroll to bring hidden elements into view.
+### 3. Coordinate System
+- **CRITICAL**: Always use normalized coordinates (0.0 to 1.0)
+- Convert visual position on screen to normalized coordinates
+- Center of screen = (0.5, 0.5)
+- Top-left = (0.0, 0.0), Bottom-right = (1.0, 1.0)
+### 4. Keyboard Shortcuts
+- Use `tab` to move between fields, `space` to toggle checkboxes, `enter` to submit forms.
+- Copy/paste: `ctrl+C`, `ctrl+V`.
+- Refresh page: `refresh()`.
+### 5. Error Recovery
+- If clicking doesn't work, try double_click or right_click.
+- If typing doesn't appear, ensure the field is focused with click.
+- If popups block the screen, try `press("enter")` or `press("escape")`.
+### 6. Security & Privacy
+- Don't attempt to bypass captchas or 2FA automatically.
+- Don't store credentials in plain text unless instructed.
+### 7. Final Answer
+- When the form is successfully submitted or the goal achieved, use:
+```python
+final_answer("Done")
+```<end_code>
+</general_guidelines>
+"""

cua2-core/src/cua2-core/services/models/anthropic.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from smolagents import LiteLLMModel
+class AnthropicModel(LiteLLMModel):
+    """LiteLLMModel subclass used for Anthropic model ids.
+
+    Adds no behaviour of its own: construction is forwarded to
+    ``LiteLLMModel`` and the class only carries the ``MODEL_TYPE`` tag.
+    """
+    # Provider tag exposed as a class attribute.
+    MODEL_TYPE = "anthropic"
+    def __init__(self, model_id: str) -> None:
+        """Create the model, passing ``model_id`` unchanged to ``LiteLLMModel``."""
+        super().__init__(model_id=model_id)

cua2-core/src/cua2-core/services/models/gemini.py ADDED Viewed

File without changes

cua2-core/src/cua2-core/services/models/get_model.py ADDED Viewed

	@@ -0,0 +1,12 @@

+from smolagents import Model
+from backend.models.models import AgentType
+from backend.services.models.anthropic import AnthropicModel
+def get_model(model_id: str) -> tuple[Model, AgentType]:
+    """Get the model"""
+    if "sonnet" in model_id:
+        return AnthropicModel(model_id=model_id), AgentType.PIXEL_COORDINATES
+    else:
+        raise ValueError(f"Model {model_id} not found")

cua2-core/src/cua2-core/services/models/qwen.py ADDED Viewed

File without changes

cua2-core/src/cua2-core/websocket/websocket_manager.py ADDED Viewed

	@@ -0,0 +1,117 @@

+import asyncio
+import json
+from typing import Dict, Optional, Set
+from fastapi import WebSocket
+from backend.models.models import AgentMetadata, WebSocketEvent
+class WebSocketManager:
+    """Manages WebSocket connections and broadcasting"""
+    def __init__(self):
+        self.active_connections: Set[WebSocket] = set()
+        self.connection_tasks: Dict[WebSocket, asyncio.Task] = {}
+    async def connect(self, websocket: WebSocket):
+        """Accept a new WebSocket connection"""
+        await websocket.accept()
+        self.active_connections.add(websocket)
+        print(f"WebSocket connected. Total connections: {len(self.active_connections)}")
+    def disconnect(self, websocket: WebSocket):
+        """Remove a WebSocket connection"""
+        self.active_connections.discard(websocket)
+        if websocket in self.connection_tasks:
+            self.connection_tasks[websocket].cancel()
+            del self.connection_tasks[websocket]
+        print(
+            f"WebSocket disconnected. Total connections: {len(self.active_connections)}"
+        )
+    async def send_personal_message(
+        self, message: WebSocketEvent, websocket: WebSocket
+    ):
+        """Send a message to a specific WebSocket connection"""
+        try:
+            await websocket.send_text(json.dumps(message.model_dump()))
+        except Exception as e:
+            print(f"Error sending personal message: {e}")
+            # Only disconnect if the connection is still in our set
+            if websocket in self.active_connections:
+                self.disconnect(websocket)
+    async def broadcast(self, message: WebSocketEvent):
+        """Broadcast a message to all connected WebSockets"""
+        if not self.active_connections:
+            return
+        # Create a list of connections to remove if they fail
+        disconnected = []
+        for connection in self.active_connections.copy():
+            try:
+                await connection.send_text(json.dumps(message.model_dump()))
+            except Exception as e:
+                print(f"Error broadcasting to connection: {e}")
+                disconnected.append(connection)
+        # Remove failed connections
+        for connection in disconnected:
+            if connection in self.active_connections:
+                self.disconnect(connection)
+    async def send_agent_start(self, content: str, message_id: str):
+        """Send agent start event"""
+        event = WebSocketEvent(
+            type="agent_start", content=content, messageId=message_id
+        )
+        await self.broadcast(event)
+    async def send_agent_progress(self, content: str, message_id: str):
+        """Send agent progress event"""
+        event = WebSocketEvent(
+            type="agent_progress", content=content, messageId=message_id
+        )
+        await self.broadcast(event)
+    async def send_agent_complete(
+        self, content: str, message_id: str, metadata: Optional[AgentMetadata] = None
+    ):
+        """Send agent complete event"""
+        event = WebSocketEvent(
+            type="agent_complete",
+            content=content,
+            messageId=message_id,
+            metadata=metadata,
+        )
+        await self.broadcast(event)
+    async def send_agent_error(self, content: str, message_id: Optional[str] = None):
+        """Send agent error event"""
+        event = WebSocketEvent(
+            type="agent_error", content=content, messageId=message_id
+        )
+        await self.broadcast(event)
+    async def send_vnc_url_set(self, vnc_url: str, content: Optional[str] = None):
+        """Send VNC URL set event"""
+        event = WebSocketEvent(
+            type="vnc_url_set",
+            content=content or f"VNC stream available at: {vnc_url}",
+            vncUrl=vnc_url,
+        )
+        await self.broadcast(event)
+    async def send_vnc_url_unset(self, content: Optional[str] = None):
+        """Send VNC URL unset event (reset to default display)"""
+        event = WebSocketEvent(
+            type="vnc_url_unset",
+            content=content or "VNC stream disconnected, showing default display",
+        )
+        await self.broadcast(event)
+    def get_connection_count(self) -> int:
+        """Get the number of active connections"""
+        return len(self.active_connections)

cua2-front/.gitignore ADDED Viewed

	@@ -0,0 +1,24 @@

+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+lerna-debug.log*
+node_modules
+dist
+dist-ssr
+*.local
+# Editor directories and files
+.vscode/*
+!.vscode/extensions.json
+.idea
+.DS_Store
+*.suo
+*.ntvs*
+*.njsproj
+*.sln
+*.sw?

cua2-front/index.html ADDED Viewed

	@@ -0,0 +1,14 @@

+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <link rel="icon" type="image/x-icon" href="/favicon.ico" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>CUA2</title>
+  </head>
+  <body>
+    <div id="root"></div>
+    <script type="module" src="/src/main.tsx"></script>
+  </body>
+</html>

cua2-front/package-lock.json ADDED Viewed

The diff for this file is too large to render. See raw diff

cua2-front/package.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "name": "cua2-front",
+  "private": true,
+  "version": "0.0.0",
+  "type": "module",
+  "scripts": {
+    "dev": "vite",
+    "build": "vite build",
+    "build:dev": "vite build --mode development",
+    "lint": "eslint src/ --config src/eslint.config.js",
+    "type-check": "tsc --noEmit --project src/tsconfig.json",
+    "preview": "vite preview"
+  },
+  "dependencies": {
+    "react": "^18.3.1",
+    "react-router-dom": "^6.30.1",
+    "react-dom": "^18.3.1"
+  },
+  "devDependencies": {
+    "@eslint/js": "^9.32.0",
+    "@types/node": "^22.16.5",
+    "@types/react": "^18.3.23",
+    "@types/react-dom": "^18.3.7",
+    "@vitejs/plugin-react-swc": "^3.11.0",
+    "autoprefixer": "^10.4.21",
+    "eslint": "^9.32.0",
+    "eslint-plugin-react-hooks": "^5.2.0",
+    "eslint-plugin-react-refresh": "^0.4.20",
+    "globals": "^15.15.0",
+    "typescript-eslint": "^8.38.0",
+    "vite": "^5.4.19"
+  }
+}

cua2-front/src/App.tsx ADDED Viewed

	@@ -0,0 +1,15 @@

+import React from 'react';
+import { BrowserRouter, Routes, Route } from "react-router-dom";
+import Index from "./pages/Index";
+/** Root component: wires the client-side router and maps "/" to the Index page. */
+const App = () => {
+  return (
+    <BrowserRouter>
+      <Routes>
+        <Route path="/" element={<Index />} />
+        {/* Add further routes here; if a catch-all "*" route is introduced, keep it last. */}
+      </Routes>
+    </BrowserRouter>
+  );
+};
+export default App;

cua2-front/src/hooks/useWebSocket.ts ADDED Viewed

	@@ -0,0 +1,154 @@

+import { WebSocketEvent } from '@/types/agent';
+import { useCallback, useEffect, useRef, useState } from 'react';
+/** Options accepted by the `useWebSocket` hook. */
+interface UseWebSocketProps {
+  /** Address handed to the `WebSocket` constructor. */
+  url: string;
+  /** Called with each incoming message after it has been JSON-parsed. */
+  onMessage: (event: WebSocketEvent) => void;
+  /** Optional handler for socket error events. */
+  onError?: (error: Event) => void;
+}
+export const useWebSocket = ({ url, onMessage, onError }: UseWebSocketProps) => {
+  const [isConnected, setIsConnected] = useState(false);
+  const [connectionState, setConnectionState] = useState<'connecting' | 'connected' | 'disconnected' | 'error'>('disconnected');
+  const wsRef = useRef<WebSocket | null>(null);
+  const reconnectTimeoutRef = useRef<NodeJS.Timeout>();
+  const reconnectAttemptsRef = useRef(0);
+  const maxReconnectAttempts = 3; // Only try three times, then stop
+  const baseReconnectDelay = 3000; // Start with 3 seconds
+  const maxReconnectDelay = 5000; // Max 5 seconds
+  const lastErrorTimeRef = useRef(0);
+  const errorThrottleMs = 5000; // Only show error toast once every 5 seconds
+  const isInitialConnectionRef = useRef(true); // Track if this is the first connection attempt
+  const getReconnectDelay = () => {
+    // Exponential backoff with jitter
+    const delay = Math.min(
+      baseReconnectDelay * Math.pow(2, reconnectAttemptsRef.current),
+      maxReconnectDelay
+    );
+    return delay + Math.random() * 1000; // Add jitter
+  };
+  const connect = useCallback(() => {
+    if (wsRef.current?.readyState === WebSocket.OPEN || wsRef.current?.readyState === WebSocket.CONNECTING) {
+      return; // Already connected or connecting
+    }
+    try {
+      setConnectionState('connecting');
+      const ws = new WebSocket(url);
+      ws.onopen = () => {
+        console.log('WebSocket connected');
+        setIsConnected(true);
+        setConnectionState('connected');
+        reconnectAttemptsRef.current = 0; // Reset attempts on successful connection
+        isInitialConnectionRef.current = false; // Mark that we've had a successful connection
+      };
+      ws.onmessage = (event) => {
+        try {
+          const data = JSON.parse(event.data) as WebSocketEvent;
+          onMessage(data);
+        } catch (error) {
+          console.error('Failed to parse WebSocket message:', error);
+        }
+      };
+      ws.onerror = (error) => {
+        console.error('WebSocket error:', error);
+        setConnectionState('error');
+        // Don't show error toasts on initial connection failure
+        // Only show toasts after we've had a successful connection before
+        if (!isInitialConnectionRef.current) {
+          // Throttle error notifications
+          const now = Date.now();
+          if (now - lastErrorTimeRef.current > errorThrottleMs) {
+            lastErrorTimeRef.current = now;
+            onError?.(error);
+          }
+        }
+      };
+      ws.onclose = (event) => {
+        console.log('WebSocket disconnected', { code: event.code, reason: event.reason });
+        setIsConnected(false);
+        setConnectionState('disconnected');
+        // Only attempt to reconnect if it wasn't a manual close (code 1000) and we haven't exceeded max attempts
+        if (event.code !== 1000 && reconnectAttemptsRef.current < maxReconnectAttempts) {
+          const delay = getReconnectDelay();
+          console.log(`Attempting to reconnect in ${Math.round(delay)}ms (attempt ${reconnectAttemptsRef.current + 1}/${maxReconnectAttempts})`);
+          reconnectTimeoutRef.current = setTimeout(() => {
+            reconnectAttemptsRef.current++;
+            connect();
+          }, delay);
+        } else if (reconnectAttemptsRef.current >= maxReconnectAttempts) {
+          console.log('Max reconnection attempts reached');
+          setConnectionState('error');
+        } else if (event.code === 1000) {
+          // Normal closure - don't reconnect
+          setConnectionState('disconnected');
+          console.log('WebSocket closed normally, not reconnecting');
+        }
+      };
+      wsRef.current = ws;
+    } catch (error) {
+      console.error('Failed to create WebSocket connection:', error);
+      setConnectionState('error');
+    }
+  }, [url, onMessage, onError]);
+  // Deliberate teardown: cancel any scheduled reconnect, close the socket with
+  // the normal-closure code 1000 (the close handler does not reconnect on that
+  // code), and reset the connection bookkeeping.
+  const disconnect = useCallback(() => {
+    const pendingReconnect = reconnectTimeoutRef.current;
+    if (pendingReconnect) {
+      clearTimeout(pendingReconnect);
+    }
+    const socket = wsRef.current;
+    if (socket) {
+      socket.close(1000, 'Manual disconnect');
+      wsRef.current = null;
+    }
+    reconnectAttemptsRef.current = 0;
+    setIsConnected(false);
+    setConnectionState('disconnected');
+  }, []);
+  // User-initiated reconnect: drop the current connection, reset the retry
+  // counter, and open a fresh socket after a short pause.
+  const manualReconnect = useCallback(() => {
+    console.log('Manual reconnect requested');
+    // disconnect() also clears any timer already stored in reconnectTimeoutRef,
+    // so repeated calls never stack several delayed connects.
+    disconnect();
+    reconnectAttemptsRef.current = 0;
+    isInitialConnectionRef.current = false; // Allow error toasts on manual reconnect
+    // Track the delayed connect in the shared timeout ref so that a later
+    // disconnect() (including the unmount cleanup) can cancel it instead of
+    // letting it open a socket nobody owns.
+    reconnectTimeoutRef.current = setTimeout(() => connect(), 1000); // Small delay before reconnecting
+  }, [disconnect, connect]);
+  // Serialises `message` as JSON and sends it over the socket. When the socket
+  // is not open the message is dropped with a warning; send/serialisation
+  // failures are logged rather than thrown.
+  const sendMessage = (message: unknown) => {
+    const socket = wsRef.current;
+    if (socket?.readyState !== WebSocket.OPEN) {
+      console.warn('WebSocket is not connected');
+      return;
+    }
+    try {
+      socket.send(JSON.stringify(message));
+    } catch (error) {
+      console.error('Failed to send WebSocket message:', error);
+    }
+  };
+  useEffect(() => { // Opens the socket on mount and again whenever `url` changes.
+    connect();
+    return () => { // Cleanup: runs before the effect re-runs and on unmount.
+      disconnect();
+    };
+  }, [url]); // Only depend on url, not the functions: `connect` and `disconnect` are deliberately left out of the dependency list
+  return { // Values and actions exposed to the component using the hook.
+    isConnected,
+    connectionState,
+    sendMessage,
+    reconnect: connect, // The raw connect callback; unlike manualReconnect it does not call disconnect() or reset the attempt counter first.
+    disconnect,
+    manualReconnect
+  };
+};

cua2-front/src/index.css ADDED Viewed

	@@ -0,0 +1,20 @@

+* { /* Global reset: no default margin/padding, and sizes include padding and border. */
+  margin: 0;
+  padding: 0;
+  box-sizing: border-box;
+}
+body { /* Page-wide font stack and font-smoothing hints. */
+  margin: 0; /* Redundant with the universal reset above. */
+  font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
+    'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',
+    sans-serif;
+  -webkit-font-smoothing: antialiased;
+  -moz-osx-font-smoothing: grayscale;
+}
+#root { /* Element the React app is mounted into; spans the full viewport height. */
+  width: 100%;
+  height: 100vh;
+}

cua2-front/src/main.tsx ADDED Viewed

	@@ -0,0 +1,5 @@

+import { createRoot } from "react-dom/client";
+import App from "./App.tsx";
+import "./index.css";
+// Mount the React application into the #root element declared in index.html.
+// Fail with an explicit message instead of a non-null assertion so a missing
+// mount node is reported clearly.
+const rootElement = document.getElementById("root");
+if (!rootElement) {
+  throw new Error('Root element "#root" was not found in the document');
+}
+createRoot(rootElement).render(<App />);

cua2-front/src/pages/Index.tsx ADDED Viewed

	@@ -0,0 +1,132 @@

+import React from 'react';
+import { useWebSocket } from '@/hooks/useWebSocket';
+import { AgentMessage, WebSocketEvent } from '@/types/agent';
+import { useEffect, useState } from 'react';
+const Index = () => { // Page component: holds the conversation state and wires it to the backend WebSocket.
+  const [messages, setMessages] = useState<AgentMessage[]>([]); // User and agent messages, appended in arrival order.
+  const [isAgentProcessing, setIsAgentProcessing] = useState(false); // True from 'agent_start' until 'agent_complete' or 'agent_error'.
+  const [vncUrl, setVncUrl] = useState<string>(''); // Set by 'vnc_url_set', cleared by 'vnc_url_unset'.
+  // WebSocket endpoint (hard-coded; the environment-variable lookup below is disabled). NOTE(review): in a Vite build this would presumably read import.meta.env.VITE_WS_URL rather than process.env.NEXT_PUBLIC_* — confirm.
+  // const WS_URL = process.env.NEXT_PUBLIC_WS_URL || 'ws://localhost:8000/ws';
+  const WS_URL = 'ws://localhost:8000/ws';
+  const handleWebSocketMessage = (event: WebSocketEvent) => { // Applies one backend event to the local state, keyed on event.type.
+    console.log('WebSocket event received:', event);
+    switch (event.type) {
+      case 'agent_start':
+        setIsAgentProcessing(true);
+        if (event.content) { // NOTE(review): the guard reads `content`, but the message built below never stores it — confirm against the backend payload.
+          const newMessage: AgentMessage = {
+            id: event.messageId,
+            type: 'agent',
+            instructions: event.instructions,
+            modelId: event.modelId,
+            timestamp: new Date(), // Client receive time, not a server timestamp.
+            isLoading: true,
+          };
+          setMessages(prev => [...prev, newMessage]);
+        }
+        break;
+      case 'agent_progress':
+        if (event.messageId && event.agentStep) { // NOTE(review): guards on event.messageId but matches on event.agentStep.messageId below — confirm both are always sent.
+          // Append a new step from an agent trace run (image, generated text, actions, tokens and timestamp)
+          setMessages(prev =>
+            prev.map(msg => {
+              if (msg.id === event.agentStep.messageId) {
+                const existingSteps = msg.steps || [];
+                const stepExists = existingSteps.some(step => step.stepId === event.agentStep.stepId);
+                if (!stepExists) { // A step is appended at most once per stepId; repeats leave the message untouched.
+                  return { ...msg, steps: [...existingSteps, event.agentStep], isLoading: true };
+                }
+                return msg;
+              }
+              return msg;
+            })
+          );
+        }
+        break;
+      case 'agent_complete':
+        setIsAgentProcessing(false);
+        if (event.messageId && event.metadata) { // NOTE(review): same pattern as above — guard uses event.messageId, match uses event.metadata.messageId.
+          setMessages(prev =>
+            prev.map(msg =>
+              msg.id === event.metadata.messageId
+                ? {
+                  ...msg,
+                  isLoading: false,
+                  metadata: event.metadata,
+                }
+                : msg
+            )
+          );
+        }
+        break;
+      case 'agent_error':
+        setIsAgentProcessing(false);
+        // TODO: Handle agent error (messages still flagged isLoading: true are not updated here)
+        break;
+      case 'vnc_url_set':
+        if (event.vncUrl) {
+          setVncUrl(event.vncUrl);
+        }
+        // TODO: Handle VNC URL set
+        break;
+      case 'vnc_url_unset':
+        setVncUrl('');
+        // TODO: Handle VNC URL unset
+        break;
+      case 'heartbeat':
+        console.log('Heartbeat received:', event);
+        break;
+    }
+  };
+  const handleWebSocketError = () => {
+    // Intentionally empty: error notifications are throttled inside the WebSocket hook
+  };
+  const { isConnected, connectionState, sendMessage, manualReconnect } = useWebSocket({ // NOTE(review): isConnected, connectionState and manualReconnect are not used by the placeholder markup below.
+    url: WS_URL,
+    onMessage: handleWebSocketMessage,
+    onError: handleWebSocketError,
+  });
+  const handleSendMessage = (content: string) => { // Records the user's message locally, then forwards it to the backend. Not yet wired to any UI element in this file.
+    const userMessage: AgentMessage = {
+      id: Date.now().toString(), // Millisecond timestamp doubles as the client-side message id.
+      type: 'user',
+      content,
+      timestamp: new Date(),
+    };
+    setMessages(prev => [...prev, userMessage]);
+    // Send the task to the Python backend via WebSocket
+    sendMessage({
+      type: 'user_task',
+      content,
+      model_id: "anthropic/claude-sonnet-4-5-20250929", // Model is hard-coded for now.
+      timestamp: new Date().toISOString(),
+    });
+  };
+  return (
+    <div>
+      <h1>Hello World</h1>
+    </div>
+  );
+};
+export default Index;

cua2-front/src/types/agent.ts ADDED Viewed

	@@ -0,0 +1,36 @@

+/**
+ * One entry in the conversation list: either a message typed by the user or
+ * an agent run that accumulates steps while it executes.
+ */
+export interface AgentMessage {
+  /** Unique id; agent steps and metadata are matched to their message through it. */
+  id: string;
+  /** Who produced the entry. */
+  type: 'user' | 'agent';
+  /** Creation time on the client. */
+  timestamp: Date;
+  /** Text of the message; set on messages created from user input. */
+  content?: string;
+  /** Task instructions; set on agent messages, absent on user messages. */
+  instructions?: string;
+  /** Model identifier; set on agent messages, absent on user messages. */
+  modelId?: string;
+  /** Steps received so far for an agent run. */
+  steps?: AgentStep[];
+  /** Run summary attached once the agent run completes. */
+  metadata?: AgentMetadata;
+  /** True while an agent run is still in progress. */
+  isLoading?: boolean;
+}
+export interface AgentStep { // One step of an agent run, appended to the `steps` of its parent message.
+  messageId: string; // Id of the AgentMessage this step belongs to.
+  stepId: string; // Identifies the step; used to avoid appending the same step twice.
+  image: string; // presumably a screenshot (URL or encoded image data) — TODO confirm format
+  generatedText: string;
+  actions: string[];
+  inputTokensUsed: number;
+  outputTokensUsed: number;
+  timestamp: Date; // NOTE(review): a value decoded from JSON arrives as a string, not a Date — confirm how events are parsed.
+}
+export interface AgentMetadata { // Summary of a finished agent run, stored on the matching message.
+  messageId: string; // Id of the AgentMessage the summary is attached to.
+  inputTokensUsed: number;
+  outputTokensUsed: number;
+  timeTaken: number; // NOTE(review): unit is not stated here (seconds vs. milliseconds) — confirm.
+  numberOfSteps: number;
+}
+/**
+ * Envelope for every event pushed by the backend over the WebSocket.
+ * `type` selects the event; the remaining fields are optional because each
+ * event type carries only some of them.
+ */
+export interface WebSocketEvent {
+  type: 'agent_start' | 'agent_progress' | 'agent_complete' | 'agent_error' | 'vnc_url_set' | 'vnc_url_unset' | 'heartbeat';
+  /** Id of the message the event refers to (read by the page for agent events). */
+  messageId?: string;
+  /** Free-text payload; the page checks it on 'agent_start'. */
+  content?: string;
+  /** Task instructions copied onto the agent message on 'agent_start'. */
+  instructions?: string;
+  /** Model identifier copied onto the agent message on 'agent_start'. */
+  modelId?: string;
+  /** Step payload carried by 'agent_progress'. */
+  agentStep?: AgentStep;
+  /** Run summary carried by 'agent_complete'. */
+  metadata?: AgentMetadata;
+  /** VNC address carried by 'vnc_url_set'. */
+  vncUrl?: string;
+}

cua2-front/src/vite-env.d.ts ADDED Viewed

	@@ -0,0 +1 @@


1	+ /// <reference types="vite/client" />

cua2-front/tsconfig.app.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "compilerOptions": {
+    "target": "ES2020",
+    "useDefineForClassFields": true,
+    "lib": [
+      "ES2020",
+      "DOM",
+      "DOM.Iterable"
+    ],
+    "module": "ESNext",
+    "skipLibCheck": true,
+    /* Bundler mode */
+    "moduleResolution": "bundler",
+    "allowImportingTsExtensions": true,
+    "isolatedModules": true,
+    "moduleDetection": "force",
+    "noEmit": true,
+    "jsx": "react-jsx",
+    /* Linting */
+    "strict": false,
+    "noUnusedLocals": false,
+    "noUnusedParameters": false,
+    "noImplicitAny": false,
+    "noFallthroughCasesInSwitch": false,
+    "baseUrl": ".",
+    "paths": {
+      "@/*": [
+        "./src/*"
+      ]
+    }
+  },
+  "include": [
+    "src",
+  ]
+}

cua2-front/tsconfig.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "files": [],
+  "references": [{ "path": "./tsconfig.app.json" }, { "path": "./tsconfig.node.json" }],
+  "compilerOptions": {
+    "baseUrl": ".",
+    "paths": {
+      "@/*": ["./src/*"]
+    },
+    "noImplicitAny": false,
+    "noUnusedParameters": false,
+    "skipLibCheck": true,
+    "allowJs": true,
+    "noUnusedLocals": false,
+    "strictNullChecks": false
+  }
+}

cua2-front/tsconfig.node.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "lib": ["ES2023"],
+    "module": "ESNext",
+    "skipLibCheck": true,
+    /* Bundler mode */
+    "moduleResolution": "bundler",
+    "allowImportingTsExtensions": true,
+    "isolatedModules": true,
+    "moduleDetection": "force",
+    "noEmit": true,
+    /* Linting */
+    "strict": true,
+    "noUnusedLocals": false,
+    "noUnusedParameters": false,
+    "noFallthroughCasesInSwitch": true
+  },
+  "include": ["vite.config.ts"]
+}

cua2-front/vite.config.ts ADDED Viewed

	@@ -0,0 +1,17 @@

+import { defineConfig } from "vite";
+import react from "@vitejs/plugin-react-swc";
+import path from "path";
+// https://vitejs.dev/config/
+// Vite settings: React via the SWC plugin, "@" aliased to ./src, and a dev
+// server bound to host "::" on port 8080.
+export default defineConfig(() => {
+  const sourceRoot = path.resolve(__dirname, "./src");
+  return {
+    plugins: [react()],
+    resolve: {
+      alias: { "@": sourceRoot },
+    },
+    server: { host: "::", port: 8080 },
+  };
+});